From 7f908fd780942fabe8dd21bc22b901ece5013c29 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 23 Jan 2024 11:30:59 -0600
Subject: [PATCH 1/9] [macOS] Comparison between exactly same types

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
index b29652a3ede1..d86fc5775415 100644
--- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp
+++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
@@ -25,7 +25,7 @@ void tagged_ptr_test()
         i = j;
 
         HPX_TEST_EQ(i.get_ptr(), &b);
-        HPX_TEST_EQ(i.get_tag(), 1);
+        HPX_TEST_EQ(i.get_tag(), 1UL);
     }
 
     {
@@ -43,7 +43,7 @@ void tagged_ptr_test()
 
     {
         tagged_ptr<int> j(&a, max_tag);
-        HPX_TEST_EQ(j.get_next_tag(), 0);
+        HPX_TEST_EQ(j.get_next_tag(), 0UL);
     }
 
     {

From ed7e16101662114a69abb0fcaa23e6ddfb7b93eb Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 23 Jan 2024 11:32:20 -0600
Subject: [PATCH 2/9] [macOS] Apple does not seem to have any typedef for
 unsigned long int

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 libs/core/debugging/src/print.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp
index 3d7cf5da2aa0..8a01d9574853 100644
--- a/libs/core/debugging/src/print.cpp
+++ b/libs/core/debugging/src/print.cpp
@@ -57,6 +57,10 @@ namespace hpx::debug {
             std::ostream&, std::int32_t const&, int);
         template HPX_CORE_EXPORT void print_dec(
             std::ostream&, std::int64_t const&, int);
+#ifdef __APPLE__
+        template HPX_CORE_EXPORT void print_dec(
+            std::ostream&, unsigned long const&, int);
+#endif
         template HPX_CORE_EXPORT void print_dec(
             std::ostream&, std::uint64_t const&, int);
 

From b75ea85559702dde98e6d4dc5ccfcc0d2a08fb2c Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Mon, 4 Nov 2024 16:14:13 -0600
Subject: [PATCH 3/9] Start RFA impl

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           |  56 ++
 .../hpx/parallel/algorithms/detail/rfa.hpp    | 851 ++++++++++++++++++
 .../algorithms/reduce_deterministic.hpp       | 569 ++++++++++++
 .../tests/unit/algorithms/CMakeLists.txt      |   1 +
 .../unit/algorithms/reduce_deterministic.cpp  |  98 ++
 5 files changed, 1575 insertions(+)
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
 create mode 100644 libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
new file mode 100644
index 000000000000..79f78b8a858a
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -0,0 +1,56 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <hpx/config.hpp>
+#include <hpx/functional/detail/tag_fallback_invoke.hpp>
+#include <hpx/functional/invoke.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
+#include <hpx/parallel/util/loop.hpp>
+
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+#include "rfa.hpp"
+
+namespace hpx::parallel::detail {
+
+    template <typename ExPolicy>
+    struct sequential_reduce_deterministic_t final
+      : hpx::functional::detail::tag_fallback<
+            sequential_reduce_deterministic_t<ExPolicy>>
+    {
+    private:
+        template <typename InIterB, typename InIterE, typename T,
+            typename Reduce>
+        friend constexpr T tag_fallback_invoke(
+            sequential_reduce_deterministic_t, ExPolicy&&, InIterB first,
+            InIterE last, T init, Reduce&& r)
+        {
+            RFA::ReproducibleFloatingAccumulator<T> rfa;
+            rfa += init;
+            rfa.add(first, last);
+            return rfa.conv();
+        }
+    };
+
+#if !defined(HPX_COMPUTE_DEVICE_CODE)
+    template <typename ExPolicy>
+    inline constexpr sequential_reduce_deterministic_t<ExPolicy>
+        sequential_reduce_deterministic =
+            sequential_reduce_deterministic_t<ExPolicy>{};
+#else
+    template <typename ExPolicy, typename... Args>
+    HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic(
+        Args&&... args)
+    {
+        return sequential_reduce_deterministic_t<ExPolicy>{}(
+            std::forward<Args>(args)...);
+    }
+#endif
+
+}    // namespace hpx::parallel::detail
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
new file mode 100644
index 000000000000..e7413add8e7e
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -0,0 +1,851 @@
+//Reproducible Floating Point Accumulations via Binned Floating Point
+//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
+//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
+//
+//The code accomplishes several objectives:
+//
+//1. Reproducible summation, independent of summation order, assuming only a
+//   subset of the IEEE 754 Floating Point Standard
+//
+//2. Has accuracy at least as good as conventional summation, and tunable
+//
+//3. Handles overflow, underflow, and other exceptions reproducibly.
+//
+//4. Makes only one read-only pass over the summands.
+//
+//5. Requires only one parallel reduction.
+//
+//6. Uses minimal memory (6 doubles per accumulator with fold=3).
+//
+//7. Relatively easy to use
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+namespace RFA {
+    template <typename Real>
+inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
+{
+    while(exp > 0)
+    {
+        arg *= 2;
+        --exp;
+    }
+    while(exp < 0)
+    {
+        arg /= 2;
+        ++exp;
+    }
+
+    return arg;
+}
+
+    ///Class to hold a reproducible summation of the numbers passed to it
+    ///
+    ///@param ftype Floating-point data type; either `float` or `double`
+    ///@param FOLD  The fold; use 3 as a default unless you understand it.
+    template <class ftype, int FOLD = 3,
+        typename std::enable_if<std::is_floating_point<ftype>::value>::type* =
+            nullptr>
+    class ReproducibleFloatingAccumulator
+    {
+    private:
+        std::array<ftype, 2 * FOLD> data = {0};
+
+        ///Floating-point precision bin width
+        static constexpr auto BIN_WIDTH =
+            std::is_same<ftype, double>::value ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+        ///Binned floating-point compression factor
+        ///This factor is used to scale down inputs before deposition into the bin of
+        ///highest index
+        static constexpr auto COMPRESSION =
+            1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1));
+        ///Binned double precision expansion factor
+        ///This factor is used to scale up inputs after deposition into the bin of
+        ///highest index
+        static constexpr auto EXPANSION =
+            1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1));
+        static constexpr auto EXP_BIAS = MAX_EXP - 2;
+        static constexpr auto EPSILON = std::numeric_limits<ftype>::epsilon();
+        ///Binned floating-point deposit endurance
+        ///The number of deposits that can be performed before a renorm is necessary.
+        ///Applies also to binned complex double precision.
+        static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
+
+        ///Generates binned floating-point reference bins
+        static constexpr std::array<ftype, MAXINDEX + MAXFOLD> initialize_bins()
+        {
+            std::array<ftype, MAXINDEX + MAXFOLD> bins{0};
+
+            if (std::is_same<ftype, float>::value)
+            {
+                bins[0] = ldexp_impl(0.75, MAX_EXP);
+            }
+            else
+            {
+                bins[0] = 2.0 * ldexp_impl(0.75, MAX_EXP - 1);
+            }
+
+            for (int index = 1; index <= MAXINDEX; index++)
+            {
+                bins[index] = ldexp_impl(0.75,
+                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
+            }
+            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
+            {
+                bins[index] = bins[index - 1];
+            }
+
+            return bins;
+        }
+
+        ///The binned floating-point reference bins
+        static constexpr auto bins = initialize_bins();
+
+        ///Return a binned floating-point reference bin
+        static inline constexpr const ftype* binned_bins(const int x)
+        {
+            return &bins[x];
+        }
+
+        ///Get the bit representation of a float
+        static inline uint32_t& get_bits(float& x)
+        {
+            return *reinterpret_cast<uint32_t*>(&x);
+        }
+        ///Get the bit representation of a double
+        static inline uint64_t& get_bits(double& x)
+        {
+            return *reinterpret_cast<uint64_t*>(&x);
+        }
+        ///Get the bit representation of a const float
+        static inline uint32_t get_bits(const float& x)
+        {
+            return *reinterpret_cast<const uint32_t*>(&x);
+        }
+        ///Get the bit representation of a const double
+        static inline uint64_t get_bits(const double& x)
+        {
+            return *reinterpret_cast<const uint64_t*>(&x);
+        }
+
+        ///Return a pointer to the primary vector
+        inline ftype* pvec()
+        {
+            return &data[0];
+        }
+        ///Return a pointer to the carry vector
+        inline ftype* cvec()
+        {
+            return &data[FOLD];
+        }
+        ///Return a const pointer to the primary vector
+        inline const ftype* pvec() const
+        {
+            return &data[0];
+        }
+        ///Return a const pointer to the carry vector
+        inline const ftype* cvec() const
+        {
+            return &data[FOLD];
+        }
+
+        static inline constexpr int ISNANINF(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
+                ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
+        }
+
+        static inline constexpr int EXP(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1);
+        }
+
+        ///Get index of float-point precision
+        ///The index of a non-binned type is the smallest index a binned type would
+        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
+        ///bins.
+        static inline constexpr int binned_dindex(const ftype x)
+        {
+            int exp = EXP(x);
+            if (exp == 0)
+            {
+                if (x == 0.0)
+                {
+                    return MAXINDEX;
+                }
+                else
+                {
+                    frexp(x, &exp);
+                    return std::min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                }
+            }
+            return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
+        }
+
+        ///Get index of manually specified binned double precision
+        ///The index of a binned type is the bin that it corresponds to. Higher
+        ///indicies correspond to smaller bins.
+        inline int binned_index() const
+        {
+            return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
+                       EXP(pvec()[0])) /
+                BIN_WIDTH;
+        }
+
+        ///Check if index of manually specified binned floating-point is 0
+        ///A quick check to determine if the index is 0
+        inline bool binned_index0() const
+        {
+            return EXP(pvec()[0]) == MAX_EXP + EXP_BIAS;
+        }
+
+        ///Update manually specified binned fp with a scalar (X -> Y)
+        ///
+        ///This method updates the binned fp to an index suitable for adding numbers
+        ///with absolute value less than @p max_abs_val
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdupdate(
+            const ftype max_abs_val, const int incpriY, const int inccarY)
+        {
+            int i;
+            int j;
+            int X_index;
+            int shift;
+            auto* const priY = pvec();
+            auto* const carY = cvec();
+
+            if (ISNANINF(priY[0]))
+            {
+                return;
+            }
+
+            X_index = binned_dindex(max_abs_val);
+            if (priY[0] == 0.0)
+            {
+                const ftype* const bins = binned_bins(X_index);
+                for (i = 0; i < FOLD; i++)
+                {
+                    priY[i * incpriY] = bins[i];
+                    carY[i * inccarY] = 0.0;
+                }
+            }
+            else
+            {
+                shift = binned_index() - X_index;
+                if (shift > 0)
+                {
+                    for (i = FOLD - 1; i >= shift; i--)
+                    {
+                        priY[i * incpriY] = priY[(i - shift) * incpriY];
+                        carY[i * inccarY] = carY[(i - shift) * inccarY];
+                    }
+                    const ftype* const bins = binned_bins(X_index);
+                    for (j = 0; j < i + 1; j++)
+                    {
+                        priY[j * incpriY] = bins[j];
+                        carY[j * inccarY] = 0.0;
+                    }
+                }
+            }
+        }
+
+        ///Add scalar @p X to suitably binned manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on an binned type Y where the index of Y is
+        ///larger than the index of @p X
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        void binned_dmddeposit(const ftype X, const int incpriY)
+        {
+            ftype M;
+            int i;
+            ftype x = X;
+            auto* const priY = pvec();
+
+            if (ISNANINF(x) || ISNANINF(priY[0]))
+            {
+                priY[0] += x;
+                return;
+            }
+
+            if (binned_index0())
+            {
+                M = priY[0];
+                ftype qd = x * COMPRESSION;
+                auto& ql = get_bits(qd);
+                ql |= 1;
+                qd += M;
+                priY[0] = qd;
+                M -= qd;
+                M *= EXPANSION * 0.5;
+                x += M;
+                x += M;
+                for (i = 1; i < FOLD - 1; i++)
+                {
+                    M = priY[i * incpriY];
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    priY[i * incpriY] = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                priY[i * incpriY] += qd;
+            }
+            else
+            {
+                ftype qd = x;
+                auto& ql = get_bits(qd);
+                for (i = 0; i < FOLD - 1; i++)
+                {
+                    M = priY[i * incpriY];
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    priY[i * incpriY] = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                priY[i * incpriY] += qd;
+            }
+        }
+
+        ///Renormalize manually specified binned double precision
+        ///
+        ///Renormalization keeps the primary vector within the necessary bins by
+        ///shifting over to the carry vector
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        void binned_dmrenorm(const int incpriX, const int inccarX)
+        {
+            auto* priX = pvec();
+            auto* carX = cvec();
+
+            if (priX[0] == 0.0 || ISNANINF(priX[0]))
+            {
+                return;
+            }
+
+            for (int i = 0; i < FOLD; i++, priX += incpriX, carX += inccarX)
+            {
+                auto tmp_renormd = priX[0];
+                auto& tmp_renorml = get_bits(tmp_renormd);
+
+                carX[0] += (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
+
+                tmp_renorml &= ~(1ull << (MANT_DIG - 3));
+                tmp_renorml |= 1ull << (MANT_DIG - 2);
+                priX[0] = tmp_renormd;
+            }
+        }
+
+        ///Add scalar to manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on an binned type Y
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdadd(const ftype X, const int incpriY, const int inccarY)
+        {
+            binned_dmdupdate(X, incpriY, inccarY);
+            binned_dmddeposit(X, incpriY);
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Convert manually specified binned fp to native double-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        double binned_conv_double(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+
+            const auto* const priX = pvec();
+            const auto* const carX = cvec();
+
+            if (ISNANINF(priX[0]))
+            {
+                return priX[0];
+            }
+
+            if (priX[0] == 0.0)
+            {
+                return 0.0;
+            }
+
+            double Y = 0.0;
+            double scale_down;
+            double scale_up;
+            int scaled;
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
+            {
+                scale_down = ldexp_impl(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
+                scale_up = ldexp_impl(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
+                scaled = std::max(
+                    std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
+                if (X_index == 0)
+                {
+                    Y += carX[0] * ((bins[0] / 6.0) * scale_down * EXPANSION);
+                    Y += carX[inccarX] * ((bins[1] / 6.0) * scale_down);
+                    Y += (priX[0] - bins[0]) * scale_down * EXPANSION;
+                    i = 2;
+                }
+                else
+                {
+                    Y += carX[0] * ((bins[0] / 6.0) * scale_down);
+                    i = 1;
+                }
+                for (; i < scaled; i++)
+                {
+                    Y += carX[i * inccarX] * ((bins[i] / 6.0) * scale_down);
+                    Y += (priX[(i - 1) * incpriX] - bins[i - 1]) * scale_down;
+                }
+                if (i == FOLD)
+                {
+                    Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]) *
+                        scale_down;
+                    return Y * scale_up;
+                }
+                if (std::isinf(Y * scale_up))
+                {
+                    return Y * scale_up;
+                }
+                Y *= scale_up;
+                for (; i < FOLD; i++)
+                {
+                    Y += carX[i * inccarX] * (bins[i] / 6.0);
+                    Y += priX[(i - 1) * incpriX] - bins[i - 1];
+                }
+                Y += priX[(FOLD - 1) * incpriX] - bins[FOLD - 1];
+            }
+            else
+            {
+                Y += carX[0] * (bins[0] / 6.0);
+                for (i = 1; i < FOLD; i++)
+                {
+                    Y += carX[i * inccarX] * (bins[i] / 6.0);
+                    Y += (priX[(i - 1) * incpriX] - bins[i - 1]);
+                }
+                Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]);
+            }
+            return Y;
+        }
+
+        ///Convert manually specified binned fp to native single-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        float binned_conv_single(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+            double Y = 0.0;
+            const auto* const priX = pvec();
+            const auto* const carX = cvec();
+
+            if (ISNANINF(priX[0]))
+            {
+                return priX[0];
+            }
+
+            if (priX[0] == 0.0)
+            {
+                return 0.0;
+            }
+
+            //Note that the following order of summation is in order of decreasing
+            //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
+            //the number of carries equal to 1.
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index == 0)
+            {
+                Y += (double) carX[0] * (double) (bins[0] / 6.0) *
+                    (double) EXPANSION;
+                Y += (double) carX[inccarX] * (double) (bins[1] / 6.0);
+                Y += (double) (priX[0] - bins[0]) * (double) EXPANSION;
+                i = 2;
+            }
+            else
+            {
+                Y += (double) carX[0] * (double) (bins[0] / 6.0);
+                i = 1;
+            }
+            for (; i < FOLD; i++)
+            {
+                Y += (double) carX[i * inccarX] * (double) (bins[i] / 6.0);
+                Y += (double) (priX[(i - 1) * incpriX] - bins[i - 1]);
+            }
+            Y += (double) (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]);
+
+            return (float) Y;
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        ///
+        ///@param other   Another binned fp of the same type
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdmadd(const ReproducibleFloatingAccumulator& other,
+            const int incpriX, const int inccarX, const int incpriY,
+            const int inccarY)
+        {
+            auto* const priX = pvec();
+            auto* const carX = cvec();
+            auto* const priY = other.pvec();
+            auto* const carY = other.cvec();
+
+            if (priX[0] == 0.0)
+                return;
+
+            if (priY[0] == 0.0)
+            {
+                for (int i = 0; i < FOLD; i++)
+                {
+                    priY[i * incpriY] = priX[i * incpriX];
+                    carY[i * inccarY] = carX[i * inccarX];
+                }
+                return;
+            }
+
+            if (ISNANINF(priX[0]) || ISNANINF(priY[0]))
+            {
+                priY[0] += priX[0];
+                return;
+            }
+
+            const auto X_index = binned_index(priX);
+            const auto Y_index = binned_index(priY);
+            const auto shift = Y_index - X_index;
+            if (shift > 0)
+            {
+                const auto* const bins = binned_bins(Y_index);
+                //shift Y upwards and add X to Y
+                for (int i = FOLD - 1; i >= shift; i--)
+                {
+                    priY[i * incpriY] = priX[i * incpriX] +
+                        (priY[(i - shift) * incpriY] - bins[i - shift]);
+                    carY[i * inccarY] =
+                        carX[i * inccarX] + carY[(i - shift) * inccarY];
+                }
+                for (int i = 0; i < shift && i < FOLD; i++)
+                {
+                    priY[i * incpriY] = priX[i * incpriX];
+                    carY[i * inccarY] = carX[i * inccarX];
+                }
+            }
+            else
+            {
+                const auto* const bins = binned_bins(X_index);
+                //shift X upwards and add X to Y
+                for (int i = 0 - shift; i < FOLD; i++)
+                {
+                    priY[i * incpriY] +=
+                        priX[(i + shift) * incpriX] - bins[i + shift];
+                    carY[i * inccarY] += carX[(i + shift) * inccarX];
+                }
+            }
+
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        void binned_dbdbadd(const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dmdmadd(other, 1, 1, 1, 1);
+        }
+
+    public:
+        ///Set the binned fp to zero
+        void zero()
+        {
+            data = {0};
+        }
+
+        ///Return the fold of the binned fp
+        int fold() const
+        {
+            return FOLD;
+        }
+
+        ///Returns the number of reference bins. Used for judging memory usage.
+        static constexpr size_t number_of_reference_bins()
+        {
+            return bins.size();
+        }
+
+        ///Accumulate an arithmetic @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
+                nullptr>
+        ReproducibleFloatingAccumulator& operator+=(const U x)
+        {
+            binned_dmdadd(static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Accumulate-subtract an arithmetic @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
+                nullptr>
+        ReproducibleFloatingAccumulator& operator-=(const U x)
+        {
+            binned_dmdadd(-static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Accumulate a binned fp @p x into the binned fp.
+        ReproducibleFloatingAccumulator& operator+=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dbdbadd(other);
+            return *this;
+        }
+
+        ///Accumulate-subtract a binned fp @p x into the binned fp.
+        ///NOTE: Makes a copy and performs arithmetic; slow.
+        ReproducibleFloatingAccumulator& operator-=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            const auto temp = -other;
+            binned_dbdbadd(temp);
+        }
+
+        ///Determines if two binned fp are equal
+        bool operator==(const ReproducibleFloatingAccumulator& other) const
+        {
+            return data == other.data;
+        }
+
+        ///Determines if two binned fp are not equal
+        bool operator!=(const ReproducibleFloatingAccumulator& other) const
+        {
+            return !operator==(other);
+        }
+
+        ///Sets this binned fp equal to the arithmetic value @p x
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
+                nullptr>
+        ReproducibleFloatingAccumulator& operator=(const U x)
+        {
+            zero();
+            binned_dmdadd(static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Sets this binned fp equal to another binned fp
+        ReproducibleFloatingAccumulator& operator=(
+            const ReproducibleFloatingAccumulator<ftype, FOLD>& o)
+        {
+            data = o.data;
+            return *this;
+        }
+
+        ///Returns the negative of this binned fp
+        ///NOTE: Makes a copy and performs arithmetic; slow.
+        ReproducibleFloatingAccumulator operator-()
+        {
+            constexpr int incpriX = 1;
+            constexpr int inccarX = 1;
+            ReproducibleFloatingAccumulator temp = *this;
+            if (pvec()[0] != 0.0)
+            {
+                const auto* const bins = binned_bins(binned_index());
+                for (int i = 0; i < FOLD; i++)
+                {
+                    temp.pvec()[i * incpriX] =
+                        bins[i] - (pvec()[i * incpriX] - bins[i]);
+                    temp.cvec()[i * inccarX] = -cvec()[i * inccarX];
+                }
+            }
+            return temp;
+        }
+
+        ///Convert this binned fp into its native floating-point representation
+        ftype conv() const
+        {
+            if (std::is_same<ftype, float>::value)
+            {
+                return binned_conv_single(1, 1);
+            }
+            else
+            {
+                return binned_conv_double(1, 1);
+            }
+        }
+
+        ///@brief Get binned fp summation error bound
+        ///
+        ///This is a bound on the absolute error of a summation using binned types
+        ///
+        ///@param N           The number of single precision floating point summands
+        ///@param max_abs_val The summand of maximum absolute value
+        ///@param binned_sum  The value of the sum computed using binned types
+        ///@return            The absolute error bound
+        static constexpr ftype error_bound(
+            const uint64_t N, const ftype max_abs_val, const ftype binned_sum)
+        {
+            const double X = std::abs(max_abs_val);
+            const double S = std::abs(binned_sum);
+            return static_cast<ftype>(std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) *
+                    ldexp_impl(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
+                ((7.0 * EPSILON) /
+                    (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
+                        7.0 * EPSILON)) *
+                    S);
+        }
+
+        ///Add @p x to the binned fp
+        void add(const ftype x)
+        {
+            binned_dmdadd(x, 1, 1);
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///
+        ///@param first       Start of range
+        ///@param last        End of range
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last, const ftype max_abs_val)
+        {
+            binned_dmdupdate(std::abs(max_abs_val), 1, 1);
+            size_t count = 0;
+            for (; first != last; first++, count++)
+            {
+                binned_dmddeposit(static_cast<ftype>(*first), 1);
+                if (count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///
+        ///NOTE: A maximum absolute value is calculated, so two passes are made over
+        ///      the data
+        ///
+        ///@param first       Start of range
+        ///@param last        End of range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last)
+        {
+            const auto max_abs_val = *std::max_element(
+                first, last, [](const auto& a, const auto& b) {
+                    return std::abs(a) < std::abs(b);
+                });
+            add(first, last, static_cast<ftype>(max_abs_val));
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename T,
+            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
+                nullptr>
+        void add(const T* input, const size_t N, const ftype max_abs_val)
+        {
+            if (N == 0)
+            {
+                return;
+            }
+            add(input, input + N, max_abs_val);
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///NOTE: A maximum absolute value is calculated, so two passes are made over
+        ///      the data
+        ///
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        template <typename T,
+            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
+                nullptr>
+        void add(const T* input, const size_t N)
+        {
+            if (N == 0)
+            {
+                return;
+            }
+            T max_abs_val = input[0];
+            for (size_t i = 0; i < N; i++)
+            {
+                max_abs_val = std::max(max_abs_val, std::abs(input[i]));
+            }
+            add(input, N, max_abs_val);
+        }
+
+        //////////////////////////////////////
+        //MANUAL OPERATIONS; USE WISELY
+        //////////////////////////////////////
+
+        ///Rebins for repeated accumulation of scalars with magnitude <= @p mav
+        ///
+        ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator
+        ///with `unsafe_add` after which `renorm()` must be called. See the source of
+        ///`add()` for an example
+        template <typename T,
+            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
+                nullptr>
+        void set_max_abs_val(const T mav)
+        {
+            binned_dmdupdate(std::abs(mav), 1, 1);
+        }
+
+        ///Add @p x to the binned fp
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()`
+        void unsafe_add(const ftype x)
+        {
+            binned_dmddeposit(x, 1);
+        }
+
+        ///Renormalizes the binned fp
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()` and one or
+        ///more calls to `unsafe_add()`
+        void renorm()
+        {
+            binned_dmrenorm(1, 1);
+        }
+    };
+}    // namespace RFA
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
new file mode 100644
index 000000000000..56c8d3349898
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -0,0 +1,569 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+/// \file parallel/algorithms/reduce_deterministic.hpp
+/// \page hpx::reduce_deterministic
+/// \headerfile hpx/algorithm.hpp
+
+#pragma once
+
+#if defined(DOXYGEN)
+
+namespace hpx {
+
+    // clang-format off
+
+    /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         predicate \a f.
+    ///
+    /// \tparam ExPolicy    The type of the execution policy to use (deduced).
+    ///                     It describes the manner in which the execution
+    ///                     of the algorithm may be parallelized and the manner
+    ///                     in which it executes the assignments.
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of a
+    ///                     forward iterator.
+    /// \tparam F           The type of the function/function object to use
+    ///                     (deduced). Unlike its sequential form, the parallel
+    ///                     overload of \a reduce requires \a F to meet the
+    ///                     requirements of \a CopyConstructible.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param policy       The execution policy to use for the scheduling of
+    ///                     the iterations.
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    /// \param f            Specifies the function (or function object) which
+    ///                     will be invoked for each of the elements in the
+    ///                     sequence specified by [first, last). This is a
+    ///                     binary predicate. The signature of this predicate
+    ///                     should be equivalent to:
+    ///                     \code
+    ///                     Ret fun(const Type1 &a, const Type1 &b);
+    ///                     \endcode \n
+    ///                     The signature does not need to have const&.
+    ///                     The types \a Type1 \a Ret must be
+    ///                     such that an object of type \a FwdIter can be
+    ///                     dereferenced and then implicitly converted to any
+    ///                     of those types.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a sequenced_policy
+    /// execute in sequential order in the calling thread.
+    ///
+    /// The reduce operations in the parallel \a copy_if algorithm invoked
+    /// with an execution policy object of type \a parallel_policy
+    /// or \a parallel_task_policy are permitted to execute in an unordered
+    /// fashion in unspecified threads, and indeterminately sequenced
+    /// within each thread.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a hpx::future<T> if the
+    ///           execution policy is of type
+    ///           \a sequenced_task_policy or
+    ///           \a parallel_task_policy and
+    ///           returns \a T otherwise.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum over the elements given by the input range
+    ///           [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(op, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename ExPolicy, typename FwdIter, typename F,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    hpx::parallel::util::detail::algorithm_result_t<ExPolicy, T>
+    reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f);
+
+    /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam ExPolicy    The type of the execution policy to use (deduced).
+    ///                     It describes the manner in which the execution
+    ///                     of the algorithm may be parallelized and the manner
+    ///                     in which it executes the assignments.
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of a
+    ///                     forward iterator.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param policy       The execution policy to use for the scheduling of
+    ///                     the iterations.
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a sequenced_policy
+    /// execute in sequential order in the calling thread.
+    ///
+    /// The reduce operations in the parallel \a copy_if algorithm invoked
+    /// with an execution policy object of type \a parallel_policy
+    /// or \a parallel_task_policy are permitted to execute in an unordered
+    /// fashion in unspecified threads, and indeterminately sequenced
+    /// within each thread.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a hpx::future<T> if the
+    ///           execution policy is of type
+    ///           \a sequenced_task_policy or
+    ///           \a parallel_task_policy and
+    ///           returns \a T otherwise.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename ExPolicy, typename FwdIter,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    util::detail::algorithm_result_t<ExPolicy, T>
+    reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init);
+
+    /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam ExPolicy    The type of the execution policy to use (deduced).
+    ///                     It describes the manner in which the execution
+    ///                     of the algorithm may be parallelized and the manner
+    ///                     in which it executes the assignments.
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of a
+    ///                     forward iterator.
+    ///
+    /// \param policy       The execution policy to use for the scheduling of
+    ///                     the iterations.
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a sequenced_policy
+    /// execute in sequential order in the calling thread.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a parallel_policy
+    /// or \a parallel_task_policy are permitted to execute in an unordered
+    /// fashion in unspecified threads, and indeterminately sequenced
+    /// within each thread.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a hpx::future<T> if the
+    ///           execution policy is of type
+    ///           \a sequenced_task_policy or
+    ///           \a parallel_task_policy and
+    ///           returns T otherwise (where T is the value_type of
+    ///           \a FwdIter).
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   The type of the initial value (and the result type) \a T is
+    ///         determined from the value_type of the used \a FwdIter.
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename ExPolicy, typename FwdIter>
+    typename hpx::parallel::util::detail::algorithm_result<ExPolicy,
+        typename std::iterator_traits<FwdIter>::value_type
+    >::type
+    reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last);
+
+    /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         predicate \a f.
+    ///
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of an
+    ///                     input iterator.
+    /// \tparam F           The type of the function/function object to use
+    ///                     (deduced). Unlike its sequential form, the parallel
+    ///                     overload of \a reduce requires \a F to meet the
+    ///                     requirements of \a CopyConstructible.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    /// \param f            Specifies the function (or function object) which
+    ///                     will be invoked for each of the elements in the
+    ///                     sequence specified by [first, last). This is a
+    ///                     binary predicate. The signature of this predicate
+    ///                     should be equivalent to:
+    ///                     \code
+    ///                     Ret fun(const Type1 &a, const Type1 &b);
+    ///                     \endcode \n
+    ///                     The signature does not need to have const&.
+    ///                     The types \a Type1 \a Ret must be
+    ///                     such that an object of type \a InIter can be
+    ///                     dereferenced and then implicitly converted to any
+    ///                     of those types.
+    ///
+    /// \returns  The \a reduce algorithm returns \a T.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum over the elements given by the input range
+    ///           [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(op, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename FwdIter, typename F,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f);
+
+    /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of an
+    ///                     input iterator.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a T.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename FwdIter,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    T reduce_deterministic(FwdIter first, FwdIter last, T init);
+
+    /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of an
+    ///                     input iterator.
+    ///
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    ///
+    /// \returns  The \a reduce algorithm returns \a T (where T is the
+    ///           value_type of \a FwdIter).
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   The type of the initial value (and the result type) \a T is
+    ///         determined from the value_type of the used \a FwdIter.
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename FwdIter>
+    typename std::iterator_traits<FwdIter>::value_type
+    reduce_deterministic(FwdIter first, FwdIter last);
+    // clang-format on
+}    // namespace hpx
+
+#else    // DOXYGEN
+
+#include <hpx/config.hpp>
+#include <hpx/concepts/concepts.hpp>
+#include <hpx/executors/execution_policy.hpp>
+#include <hpx/iterator_support/range.hpp>
+#include <hpx/pack_traversal/unwrap.hpp>
+#include <hpx/parallel/algorithms/detail/accumulate.hpp>
+#include <hpx/parallel/algorithms/detail/dispatch.hpp>
+#include <hpx/parallel/algorithms/detail/distance.hpp>
+#include <hpx/parallel/algorithms/detail/reduce.hpp>
+#include <hpx/parallel/algorithms/detail/reduce_deterministic.hpp>
+#include <hpx/parallel/util/detail/algorithm_result.hpp>
+#include <hpx/parallel/util/detail/sender_util.hpp>
+#include <hpx/parallel/util/loop.hpp>
+#include <hpx/parallel/util/partitioner.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+namespace hpx::parallel {
+
+    ///////////////////////////////////////////////////////////////////////////
+    // reduce
+    namespace detail {
+
+        /// \cond NOINTERNAL
+        template <typename T>
+        struct reduce_deterministic
+          : public algorithm<reduce_deterministic<T>, T>
+        {
+            constexpr reduce_deterministic() noexcept
+              : algorithm<reduce_deterministic, T>("reduce_deterministic")
+            {
+            }
+
+            template <typename ExPolicy, typename InIterB, typename InIterE,
+                typename T_, typename Reduce>
+            static constexpr T sequential(ExPolicy&& policy, InIterB first,
+                InIterE last, T_&& init, Reduce&& r)
+            {
+                // hpx::parallel::detail::sequential_reduce_deterministic_t<ExPolicy> seq;
+                // return seq(policy,first,last,0.0f,r);
+                return hpx::parallel::detail::sequential_reduce_deterministic<ExPolicy>(
+                    HPX_FORWARD(ExPolicy, policy), first, last,
+                    HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r));
+            }
+
+            //     template <typename ExPolicy, typename FwdIterB, typename FwdIterE,
+            //         typename T_, typename Reduce>
+            //     static util::detail::algorithm_result_t<ExPolicy, T> parallel(
+            //         ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init,
+            //         Reduce&& r)
+            //     {
+            //         if (first == last)
+            //         {
+            //             return util::detail::algorithm_result<ExPolicy, T>::get(
+            //                 HPX_FORWARD(T_, init));
+            //         }
+
+            //         auto f1 = [r](FwdIterB part_begin, std::size_t part_size) -> T {
+            //             T val = *part_begin;
+            //             return detail::sequential_reduce<ExPolicy>(
+            //                 ++part_begin, --part_size, HPX_MOVE(val), r);
+            //         };
+
+            //         return util::partitioner<ExPolicy, T>::call(
+            //             HPX_FORWARD(ExPolicy, policy), first,
+            //             detail::distance(first, last), HPX_MOVE(f1),
+            //             hpx::unwrapping(
+            //                 [init = HPX_FORWARD(T_, init),
+            //                     r = HPX_FORWARD(Reduce, r)](auto&& results) -> T {
+            //                     return detail::sequential_reduce<ExPolicy>(
+            //                         hpx::util::begin(results),
+            //                         hpx::util::size(results), init, r);
+            //                 }));
+            //     }
+        };
+        /// \endcond
+    }    // namespace detail
+}    // namespace hpx::parallel
+
+namespace hpx {
+
+    ///////////////////////////////////////////////////////////////////////////
+    // CPO for hpx::reduce
+    inline constexpr struct reduce_deterministic_t final
+      : hpx::detail::tag_parallel_algorithm<reduce_deterministic_t>
+    {
+    private:
+        // clang-format off
+        template <typename ExPolicy, typename FwdIter, typename F,
+            typename T = typename std::iterator_traits<FwdIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::is_execution_policy_v<ExPolicy> &&
+                hpx::traits::is_iterator_v<FwdIter>
+            )>
+        // clang-format on
+        friend hpx::parallel::util::detail::algorithm_result_t<ExPolicy, T>
+        tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy,
+            FwdIter first, FwdIter last, T init, F f)
+        {
+            static_assert(hpx::traits::is_forward_iterator_v<FwdIter>,
+                "Requires at least forward iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init),
+                HPX_MOVE(f));
+        }
+
+        // clang-format off
+        template <typename ExPolicy, typename FwdIter,
+            typename T = typename std::iterator_traits<FwdIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::is_execution_policy_v<ExPolicy> &&
+                hpx::traits::is_iterator_v<FwdIter>
+            )>
+        // clang-format on
+        friend hpx::parallel::util::detail::algorithm_result_t<ExPolicy, T>
+        tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy,
+            FwdIter first, FwdIter last, T init)
+        {
+            static_assert(hpx::traits::is_forward_iterator_v<FwdIter>,
+                "Requires at least forward iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init),
+                std::plus<>{});
+        }
+
+        // clang-format off
+        template <typename ExPolicy, typename FwdIter,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::is_execution_policy_v<ExPolicy> &&
+                hpx::traits::is_iterator_v<FwdIter>
+            )>
+        // clang-format on
+        friend hpx::parallel::util::detail::algorithm_result_t<ExPolicy,
+            typename std::iterator_traits<FwdIter>::value_type>
+        tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy,
+            FwdIter first, FwdIter last)
+        {
+            static_assert(hpx::traits::is_forward_iterator_v<FwdIter>,
+                "Requires at least forward iterator.");
+
+            using value_type =
+                typename std::iterator_traits<FwdIter>::value_type;
+
+            return hpx::parallel::detail::reduce_deterministic<value_type>()
+                .call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{},
+                    std::plus<>{});
+        }
+
+        // clang-format off
+        template <typename InIter, typename F,
+            typename T = typename std::iterator_traits<InIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::traits::is_iterator_v<InIter>
+            )>
+        // clang-format on
+        friend T tag_fallback_invoke(
+            hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f)
+        {
+            static_assert(hpx::traits::is_input_iterator_v<InIter>,
+                "Requires at least input iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f));
+        }
+
+        // clang-format off
+        template <typename InIter,
+            typename T = typename std::iterator_traits<InIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::traits::is_iterator_v<InIter>
+            )>
+        // clang-format on
+        friend T tag_fallback_invoke(
+            hpx::reduce_deterministic_t, InIter first, InIter last, T init)
+        {
+            static_assert(hpx::traits::is_input_iterator_v<InIter>,
+                "Requires at least input iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                hpx::execution::seq, first, last, HPX_MOVE(init),
+                std::plus<>{});
+        }
+
+        // clang-format off
+        template <typename InIter,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::traits::is_iterator_v<InIter>
+            )>
+        // clang-format on
+        friend typename std::iterator_traits<InIter>::value_type
+        tag_fallback_invoke(
+            hpx::reduce_deterministic_t, InIter first, InIter last)
+        {
+            static_assert(hpx::traits::is_input_iterator_v<InIter>,
+                "Requires at least input iterator.");
+
+            using value_type =
+                typename std::iterator_traits<InIter>::value_type;
+
+            return hpx::parallel::detail::reduce_deterministic<value_type>()
+                .call(hpx::execution::seq, first, last, value_type{},
+                    std::plus<>());
+        }
+    } reduce_deterministic{};
+}    // namespace hpx
+
+#endif    // DOXYGEN
diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
index 7d3a9204530c..559ee830030e 100644
--- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
@@ -94,6 +94,7 @@ set(tests
     partition_copy
     reduce_
     reduce_by_key
+    reduce_deterministic
     remove
     remove1
     remove2
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
new file mode 100644
index 000000000000..e2229a992fe4
--- /dev/null
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -0,0 +1,98 @@
+//  Copyright (c) 2014-2020 Hartmut Kaiser
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <hpx/modules/testing.hpp>
+#include <hpx/parallel/algorithms/reduce_deterministic.hpp>
+#include <hpx/init.hpp>
+
+#include <cstddef>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "test_utils.hpp"
+
+int seed = std::random_device{}();
+std::mt19937 gen(seed);
+
+///////////////////////////////////////////////////////////////////////////////
+template <typename IteratorTag>
+void test_reduce1(IteratorTag)
+{
+    using base_iterator = std::vector<float>::iterator;
+    using iterator = test::test_iterator<base_iterator, IteratorTag>;
+
+    std::vector<float> c(10007);
+    std::iota(std::begin(c), std::end(c), gen());
+
+    float val(42);
+    auto op = [](auto v1, auto v2) { return v1 + v2; };
+
+    int r1 =
+        hpx::reduce_deterministic(iterator(std::begin(c)), iterator(std::end(c)), val, op);
+
+    // verify values
+    int r2 = std::accumulate(std::begin(c), std::end(c), val, op);
+    HPX_TEST_EQ(r1, r2);
+}
+
+template <typename IteratorTag>
+void test_reduce1()
+{
+    using namespace hpx::execution;
+
+    test_reduce1(IteratorTag());
+}
+
+void reduce_test1()
+{
+    test_reduce1<std::random_access_iterator_tag>();
+    test_reduce1<std::forward_iterator_tag>();
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+int hpx_main(hpx::program_options::variables_map& vm)
+{
+    unsigned int seed = (unsigned int) std::time(nullptr);
+    if (vm.count("seed"))
+        seed = vm["seed"].as<unsigned int>();
+
+    std::cout << "using seed: " << seed << std::endl;
+    gen.seed(seed);
+
+    reduce_test1();
+
+    return hpx::local::finalize();
+}
+
+int main(int argc, char* argv[])
+{
+    // add command line option which controls the random number generator seed
+    using namespace hpx::program_options;
+    options_description desc_commandline(
+        "Usage: " HPX_APPLICATION_STRING " [options]");
+
+    desc_commandline.add_options()("seed,s", value<unsigned int>(),
+        "the random number generator seed to use for this run");
+    // By default this test should run on all available cores
+    std::vector<std::string> const cfg = {"hpx.os_threads=all"};
+
+    // Initialize and run HPX
+    hpx::local::init_params init_args;
+    init_args.desc_cmdline = desc_commandline;
+    init_args.cfg = cfg;
+
+    HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0,
+        "HPX main exited with non-zero status");
+
+    return hpx::util::report_errors();
+}

From 7e070e9a80c14b6d6fcd291de5ee9379039773de Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Wed, 4 Dec 2024 00:51:55 -0600
Subject: [PATCH 4/9] Fix some issues and add tests for determinism

- Add Kate's CUDA impl for RFA

TODO:
- Use original RFA instead of Kate's
- Make a parallel version out of it
- Make a partition vector suitable version

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 cmake/HPX_AddConfigTest.cmake                 |    2 +-
 .../detail/reduce_deterministic.hpp           |   22 +-
 .../hpx/parallel/algorithms/detail/rfa.hpp    |   63 +-
 .../parallel/algorithms/detail/rfa_cuda.hpp   | 1168 +++++++++++++++++
 .../tests/unit/algorithms/CMakeLists.txt      |    4 +
 .../unit/algorithms/reduce_deterministic.cpp  |  208 ++-
 6 files changed, 1420 insertions(+), 47 deletions(-)
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp

diff --git a/cmake/HPX_AddConfigTest.cmake b/cmake/HPX_AddConfigTest.cmake
index 92b24ceec194..b0e1ea437452 100644
--- a/cmake/HPX_AddConfigTest.cmake
+++ b/cmake/HPX_AddConfigTest.cmake
@@ -408,7 +408,7 @@ function(hpx_check_for_cxx11_std_quick_exit)
   add_hpx_config_test(
     HPX_WITH_CXX11_STD_QUICK_EXIT
     SOURCE cmake/tests/cxx11_std_quick_exit.cpp
-    FILE ${ARGN}
+    FILE EXECUTE ${ARGN}
   )
 endfunction()
 
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 79f78b8a858a..acf0cc687f56 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -9,10 +9,11 @@
 #include <hpx/config.hpp>
 #include <hpx/functional/detail/tag_fallback_invoke.hpp>
 #include <hpx/functional/invoke.hpp>
-#include <hpx/parallel/algorithms/detail/rfa.hpp>
+#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
 #include <hpx/parallel/util/loop.hpp>
 
 #include <cstddef>
+#include <limits>
 #include <type_traits>
 #include <utility>
 #include "rfa.hpp"
@@ -31,9 +32,22 @@ namespace hpx::parallel::detail {
             sequential_reduce_deterministic_t, ExPolicy&&, InIterB first,
             InIterE last, T init, Reduce&& r)
         {
-            RFA::ReproducibleFloatingAccumulator<T> rfa;
-            rfa += init;
-            rfa.add(first, last);
+            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            bins.initialize_bins();
+            memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+
+            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+            rfa.set_max_abs_val(init);
+            rfa.unsafe_add(init);
+            rfa.renorm();
+            // rfa.add(first, last);
+            for (auto e = first; e != last; ++e)
+            {
+                // printf("%f \n",*e);
+                rfa.set_max_abs_val(*e);
+                rfa.unsafe_add(*e);
+                rfa.renorm();
+            }
             return rfa.conv();
         }
     };
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index e7413add8e7e..340f4cc7b2ef 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -29,21 +29,26 @@
 
 namespace RFA {
     template <typename Real>
-inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
-{
-    while(exp > 0)
+    inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
     {
-        arg *= 2;
-        --exp;
+        return std::ldexp(arg, exp);
+        // while (arg == 0)
+        // {
+        //     return arg;
+        // }
+        // while (exp > 0)
+        // {
+        //     arg *= static_cast<Real>(2);
+        //     --exp;
+        // }
+        // while (exp < 0)
+        // {
+        //     arg /= static_cast<Real>(2);
+        //     ++exp;
+        // }
+
+        // return arg;
     }
-    while(exp < 0)
-    {
-        arg /= 2;
-        ++exp;
-    }
-
-    return arg;
-}
 
     ///Class to hold a reproducible summation of the numbers passed to it
     ///
@@ -54,7 +59,7 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
             nullptr>
     class ReproducibleFloatingAccumulator
     {
-    private:
+    public:
         std::array<ftype, 2 * FOLD> data = {0};
 
         ///Floating-point precision bin width
@@ -86,7 +91,8 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
         ///The number of deposits that can be performed before a renorm is necessary.
         ///Applies also to binned complex double precision.
         static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
-
+        // static_assert(ENDURANCE == 0);
+    public:
         ///Generates binned floating-point reference bins
         static constexpr std::array<ftype, MAXINDEX + MAXFOLD> initialize_bins()
         {
@@ -115,8 +121,9 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
         }
 
         ///The binned floating-point reference bins
-        static constexpr auto bins = initialize_bins();
+        static std::array<ftype, MAXINDEX + MAXFOLD> bins;
 
+    private:
         ///Return a binned floating-point reference bin
         static inline constexpr const ftype* binned_bins(const int x)
         {
@@ -519,10 +526,10 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
             const int incpriX, const int inccarX, const int incpriY,
             const int inccarY)
         {
-            auto* const priX = pvec();
-            auto* const carX = cvec();
-            auto* const priY = other.pvec();
-            auto* const carY = other.cvec();
+            auto* const priX = other.pvec();
+            auto* const carX = other.cvec();
+            auto* const priY = pvec();
+            auto* const carY = cvec();
 
             if (priX[0] == 0.0)
                 return;
@@ -543,8 +550,9 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
                 return;
             }
 
-            const auto X_index = binned_index(priX);
-            const auto Y_index = binned_index(priY);
+            // const auto X_index = binned_index(priX);
+            const auto X_index = other.binned_index();
+            const auto Y_index = binned_index();
             const auto shift = Y_index - X_index;
             if (shift > 0)
             {
@@ -721,7 +729,8 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
         {
             const double X = std::abs(max_abs_val);
             const double S = std::abs(binned_sum);
-            return static_cast<ftype>(std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) *
+            return static_cast<ftype>(
+                std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) *
                     ldexp_impl(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
                 ((7.0 * EPSILON) /
                     (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
@@ -848,4 +857,12 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
             binned_dmrenorm(1, 1);
         }
     };
+
+    template <>
+    auto ReproducibleFloatingAccumulator<float>::bins =
+        ReproducibleFloatingAccumulator<float>::initialize_bins();
+    
+    template <>
+    auto ReproducibleFloatingAccumulator<double>::bins =
+        ReproducibleFloatingAccumulator<double>::initialize_bins();
 }    // namespace RFA
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
new file mode 100644
index 000000000000..05f71d9ae746
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
@@ -0,0 +1,1168 @@
+//Reproducible Floating Point Accumulations via Binned Floating Point
+//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
+//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
+//
+//The code accomplishes several objectives:
+//
+//1. Reproducible summation, independent of summation order, assuming only a
+//   subset of the IEEE 754 Floating Point Standard
+//
+//2. Has accuracy at least as good as conventional summation, and tunable
+//
+//3. Handles overflow, underflow, and other exceptions reproducibly.
+//
+//4. Makes only one read-only pass over the summands.
+//
+//5. Requires only one parallel reduction.
+//
+//6. Uses minimal memory (6 doubles per accumulator with fold=3).
+//
+//7. Relatively easy to use
+
+#pragma once
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#ifndef __CUDACC__
+#define __host__
+#define __device__
+#define __forceinline__
+#include <array>
+using std::array;
+using std::max;
+using std::min;
+#else
+#include <cuda/std/array>
+using cuda::std::array;
+using cuda::std::max;
+using cuda::std::min;
+#include "vector.hpp"
+#endif
+
+namespace hpx::parallel::detail::rfa {
+    template <typename F>
+    struct type4
+    {
+        F x;
+        F y;
+        F z;
+        F w;
+    };
+
+    template <typename F>
+    struct type2
+    {
+        F x;
+        F y;
+    };
+    using float4 = type4<float>;
+    using double4 = type4<double>;
+    using float2 = type2<float>;
+    using double2 = type2<double>;
+
+    auto abs_max(float4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<float> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<double> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(float2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<float> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<double> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+// disable zero checks
+#define DISABLE_ZERO
+
+// disable nan / infinity checks
+#define DISABLE_NANINF
+
+// jump table for indexing into data
+#define MAX_JUMP 5
+    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
+
+    template <typename Real>
+    inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
+    {
+        return std::ldexp(arg, exp);
+        // while (arg == 0)
+        // {
+        //     return arg;
+        // }
+        // while (exp > 0)
+        // {
+        //     arg *= static_cast<Real>(2);
+        //     --exp;
+        // }
+        // while (exp < 0)
+        // {
+        //     arg /= static_cast<Real>(2);
+        //     ++exp;
+        // }
+
+        // return arg;
+    }
+
+    template <class ftype>
+    struct RFA_bins
+    {
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+
+        ///The binned floating-point reference bins
+        array<ftype, MAXINDEX + MAXFOLD> bins = {};
+
+        constexpr ftype& operator[](int d)
+        {
+            return bins[d];
+        }
+
+        void initialize_bins()
+        {
+            if constexpr (std::is_same_v<ftype, float>)
+            {
+                bins[0] = std::ldexp(0.75, MAX_EXP);
+            }
+            else
+            {
+                bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1);
+            }
+
+            for (int index = 1; index <= MAXINDEX; index++)
+            {
+                bins[index] = ldexp(0.75,
+                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
+            }
+            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
+            {
+                bins[index] = bins[index - 1];
+            }
+        }
+    };
+
+    static char bin_host_buffer[sizeof(RFA_bins<double>)];
+#ifdef __CUDACC__
+    __constant__ static char bin_device_buffer[sizeof(RFA_bins<double>)];
+#endif
+
+    ///Class to hold a reproducible summation of the numbers passed to it
+    ///
+    ///@param ftype Floating-point data type; either `float` or `double
+    ///@param FOLD  The fold; use 3 as a default unless you understand it.
+    template <class ftype_, int FOLD_ = 3,
+        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
+            nullptr>
+    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
+    {
+    public:
+        using ftype = ftype_;
+        static constexpr int FOLD = FOLD_;
+
+    private:
+        array<ftype, 2 * FOLD> data = {0};
+
+        ///Floating-point precision bin width
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+        ///Binned floating-point compression factor
+        ///This factor is used to scale down inputs before deposition into the bin of
+        ///highest index
+        static constexpr auto COMPRESSION =
+            1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1));
+        ///Binned double precision expansion factor
+        ///This factor is used to scale up inputs after deposition into the bin of
+        ///highest index
+        static constexpr auto EXPANSION =
+            1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1));
+        static constexpr auto EXP_BIAS = MAX_EXP - 2;
+        static constexpr auto EPSILON = std::numeric_limits<ftype>::epsilon();
+        ///Binned floating-point deposit endurance
+        ///The number of deposits that can be performed before a renorm is necessary.
+        ///Applies also to binned complex double precision.
+        static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
+
+        ///Return a binned floating-point reference bin
+        inline const ftype* binned_bins(const int x) const
+        {
+#ifdef __CUDA_ARCH__    // must be arch not CC here
+            return &reinterpret_cast<RFA_bins<ftype>&>(bin_device_buffer)[x];
+#else
+            return &reinterpret_cast<RFA_bins<ftype>&>(bin_host_buffer)[x];
+#endif
+        }
+
+        ///Get the bit representation of a float
+        static inline uint32_t& get_bits(float& x)
+        {
+            return *reinterpret_cast<uint32_t*>(&x);
+        }
+        ///Get the bit representation of a double
+        static inline uint64_t& get_bits(double& x)
+        {
+            return *reinterpret_cast<uint64_t*>(&x);
+        }
+        ///Get the bit representation of a const float
+        static inline uint32_t get_bits(const float& x)
+        {
+            return *reinterpret_cast<const uint32_t*>(&x);
+        }
+        ///Get the bit representation of a const double
+        static inline uint64_t get_bits(const double& x)
+        {
+            return *reinterpret_cast<const uint64_t*>(&x);
+        }
+
+        ///Return primary vector value const ref
+        inline const ftype& primary(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[4];
+                default:
+                    return data[FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[i];
+            }
+        }
+
+        ///Return carry vector value const ref
+        inline const ftype& carry(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[FOLD + 0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[FOLD + 1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[FOLD + 2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[FOLD + 3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[FOLD + 4];
+                default:
+                    return data[2 * FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[FOLD + i];
+            }
+        }
+
+        ///Return primary vector value ref
+        inline ftype& primary(int i)
+        {
+            const auto& c = *this;
+            return const_cast<ftype&>(c.primary(i));
+        }
+
+        ///Return carry vector value ref
+        inline ftype& carry(int i)
+        {
+            const auto& c = *this;
+            return const_cast<ftype&>(c.carry(i));
+        }
+
+#ifdef DISABLE_ZERO
+        static inline constexpr bool ISZERO(const ftype)
+        {
+            return false;
+        }
+#else
+        static inline constexpr bool ISZERO(const ftype x)
+        {
+            return x == 0.0;
+        }
+#endif
+
+#ifdef DISABLE_NANINF
+        static inline constexpr int ISNANINF(const ftype)
+        {
+            return false;
+        }
+#else
+        static inline constexpr int ISNANINF(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
+                ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
+        }
+#endif
+
+        static inline constexpr int EXP(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1);
+        }
+
+        ///Get index of float-point precision
+        ///The index of a non-binned type is the smallest index a binned type would
+        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
+        ///bins.
+        static inline constexpr int binned_dindex(const ftype x)
+        {
+            int exp = EXP(x);
+            if (exp == 0)
+            {
+                if (x == 0.0)
+                {
+                    return MAXINDEX;
+                }
+                else
+                {
+                    frexp(x, &exp);
+                    return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                }
+            }
+            return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
+        }
+
+        ///Get index of manually specified binned double precision
+        ///The index of a binned type is the bin that it corresponds to. Higher
+        ///indicies correspond to smaller bins.
+        inline int binned_index() const
+        {
+            return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
+                       EXP(primary(0))) /
+                BIN_WIDTH;
+        }
+
+        ///Check if index of manually specified binned floating-point is 0
+        ///A quick check to determine if the index is 0
+        inline bool binned_index0() const
+        {
+            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
+        }
+
+        ///Update manually specified binned fp with a scalar (X -> Y)
+        ///
+        ///This method updates the binned fp to an index suitable for adding numbers
+        ///with absolute value less than @p max_abs_val
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdupdate(
+            const ftype max_abs_val, const int incpriY, const int inccarY)
+        {
+            if (ISNANINF(primary(0)))
+                return;
+
+            int X_index = binned_dindex(max_abs_val);
+            if (ISZERO(primary(0)))
+            {
+                const ftype* const bins = binned_bins(X_index);
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) = bins[i];
+                    carry(i * inccarY) = 0.0;
+                }
+            }
+            else
+            {
+                int shift = binned_index() - X_index;
+                if (shift > 0)
+                {
+#pragma unroll
+                    for (int i = FOLD - 1; i >= 1; i--)
+                    {
+                        if (i < shift)
+                            break;
+                        primary(i * incpriY) = primary((i - shift) * incpriY);
+                        carry(i * inccarY) = carry((i - shift) * inccarY);
+                    }
+                    const ftype* const bins = binned_bins(X_index);
+#pragma unroll
+                    for (int j = 0; j < FOLD; j++)
+                    {
+                        if (j >= shift)
+                            break;
+                        primary(j * incpriY) = bins[j];
+                        carry(j * inccarY) = 0.0;
+                    }
+                }
+            }
+        }
+
+        ///Add scalar @p X to suitably binned manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on an binned type Y where the index of Y is
+        ///larger than the index of @p X
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        void binned_dmddeposit(const ftype X, const int incpriY)
+        {
+            ftype M;
+            ftype x = X;
+
+            if (ISNANINF(x) || ISNANINF(primary(0)))
+            {
+                primary(0) += x;
+                return;
+            }
+
+            if (binned_index0())
+            {
+                M = primary(0);
+                ftype qd = x * COMPRESSION;
+                auto& ql = get_bits(qd);
+                ql |= 1;
+                qd += M;
+                primary(0) = qd;
+                M -= qd;
+                M *= EXPANSION * 0.5;
+                x += M;
+                x += M;
+#pragma unroll
+                for (int i = 1; i < FOLD - 1; i++)
+                {
+                    M = primary(i * incpriY);
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    primary(i * incpriY) = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                primary((FOLD - 1) * incpriY) += qd;
+            }
+            else
+            {
+                ftype qd = x;
+                auto& ql = get_bits(qd);
+#pragma unroll
+                for (int i = 0; i < FOLD - 1; i++)
+                {
+                    M = primary(i * incpriY);
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    primary(i * incpriY) = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                primary((FOLD - 1) * incpriY) += qd;
+            }
+        }
+
+        ///Renormalize manually specified binned double precision
+        ///
+        ///Renormalization keeps the primary vector within the necessary bins by
+        ///shifting over to the carry vector
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        inline void binned_dmrenorm(const int incpriX, const int inccarX)
+        {
+            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
+                return;
+
+            for (int i = 0; i < FOLD; i++)
+            {
+                auto tmp_renormd = primary(i * incpriX);
+                auto& tmp_renorml = get_bits(tmp_renormd);
+
+                carry(i * inccarX) +=
+                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
+
+                tmp_renorml &= ~(1ull << (MANT_DIG - 3));
+                tmp_renorml |= 1ull << (MANT_DIG - 2);
+                primary(i * incpriX) = tmp_renormd;
+            }
+        }
+
+        ///Add scalar to manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on an binned type Y
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdadd(const ftype X, const int incpriY, const int inccarY)
+        {
+            binned_dmdupdate(X, incpriY, inccarY);
+            binned_dmddeposit(X, incpriY);
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Convert manually specified binned fp to native double-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        double binned_conv_double(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
+                return 0.0;
+
+            double Y = 0.0;
+            double scale_down;
+            double scale_up;
+            int scaled;
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
+            {
+                scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
+                scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
+                scaled =
+                    max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
+                if (X_index == 0)
+                {
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
+                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
+                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
+                    i = 2;
+                }
+                else
+                {
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
+                    i = 1;
+                }
+                for (; i < scaled; i++)
+                {
+                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
+                    Y +=
+                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
+                }
+                if (i == FOLD)
+                {
+                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
+                        scale_down;
+                    return Y * scale_up;
+                }
+                if (std::isinf(Y * scale_up))
+                {
+                    return Y * scale_up;
+                }
+                Y *= scale_up;
+                for (; i < FOLD; i++)
+                {
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += primary((i - 1) * incpriX) - bins[i - 1];
+                }
+                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
+            }
+            else
+            {
+                Y += carry(0) * (bins[0] / 6.0);
+                for (i = 1; i < FOLD; i++)
+                {
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
+                }
+                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+            }
+            return Y;
+        }
+
+        ///Convert manually specified binned fp to native single-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        float binned_conv_single(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+            double Y = 0.0;
+
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
+                return 0.0;
+
+            //Note that the following order of summation is in order of decreasing
+            //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
+            //the number of carries equal to 1.
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index == 0)
+            {
+                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
+                    (double) EXPANSION;
+                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
+                i = 2;
+            }
+            else
+            {
+                Y += (double) carry(0) * (double) (bins[0] / 6.0);
+                i = 1;
+            }
+            for (; i < FOLD; i++)
+            {
+                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
+            }
+            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+
+            return (float) Y;
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        ///
+        ///@param other   Another binned fp of the same type
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
+            const int incpriX, const int inccarX, const int incpriY,
+            const int inccarY)
+        {
+            if (ISZERO(x.primary(0)))
+                return;
+
+            if (ISZERO(primary(0)))
+            {
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
+                }
+                return;
+            }
+
+            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
+            {
+                primary(0) += x.primary(0);
+                return;
+            }
+
+            const auto X_index = x.binned_index();
+            const auto Y_index = this->binned_index();
+            const auto shift = Y_index - X_index;
+            if (shift > 0)
+            {
+                const auto* const bins = binned_bins(Y_index);
+                //shift Y upwards and add X to Y
+#pragma unroll
+                for (int i = FOLD - 1; i >= 1; i--)
+                {
+                    if (i < shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX) +
+                        (primary((i - shift) * incpriY) - bins[i - shift]);
+                    carry(i * inccarY) =
+                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
+                }
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    if (i == shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
+                }
+            }
+            else if (shift < 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                //shift X upwards and add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    if (i < -shift)
+                        continue;
+                    primary(i * incpriY) +=
+                        x.primary((i + shift) * incpriX) - bins[i + shift];
+                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
+                }
+            }
+            else if (shift == 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                // add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
+                    carry(i * inccarY) += x.carry(i * inccarX);
+                }
+            }
+
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        void binned_dbdbadd(const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dmdmadd(other, 1, 1, 1, 1);
+        }
+
+    public:
+        ReproducibleFloatingAccumulator() = default;
+        ReproducibleFloatingAccumulator(
+            const ReproducibleFloatingAccumulator&) = default;
+        ///Sets this binned fp equal to another binned fp
+        ReproducibleFloatingAccumulator& operator=(
+            const ReproducibleFloatingAccumulator&) = default;
+
+        ///Set the binned fp to zero
+        void zero()
+        {
+            data = {0};
+        }
+
+        ///Return the fold of the binned fp
+        constexpr int fold() const
+        {
+            return FOLD;
+        }
+
+        ///Return the endurance of the binned fp
+        constexpr int endurance() const
+        {
+            return ENDURANCE;
+        }
+
+        ///Returns the number of reference bins. Used for judging memory usage.
+        constexpr size_t number_of_reference_bins()
+        {
+            return array<ftype, MAXINDEX + MAXFOLD>::size();
+        }
+
+        ///Accumulate an arithmetic @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator+=(const U x)
+        {
+            binned_dmdadd(static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Accumulate-subtract an arithmetic @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator-=(const U x)
+        {
+            binned_dmdadd(-static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Accumulate a binned fp @p x into the binned fp.
+        ReproducibleFloatingAccumulator& operator+=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dbdbadd(other);
+            return *this;
+        }
+
+        ///Accumulate-subtract a binned fp @p x into the binned fp.
+        ///NOTE: Makes a copy and performs arithmetic; slow.
+        ReproducibleFloatingAccumulator& operator-=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            const auto temp = -other;
+            binned_dbdbadd(temp);
+        }
+
+        ///Determines if two binned fp are equal
+        bool operator==(const ReproducibleFloatingAccumulator& other) const
+        {
+            return data == other.data;
+        }
+
+        ///Determines if two binned fp are not equal
+        bool operator!=(const ReproducibleFloatingAccumulator& other) const
+        {
+            return !operator==(other);
+        }
+
+        ///Sets this binned fp equal to the arithmetic value @p x
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator=(const U x)
+        {
+            zero();
+            binned_dmdadd(static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Returns the negative of this binned fp
+        ///NOTE: Makes a copy and performs arithmetic; slow.
+        ReproducibleFloatingAccumulator operator-()
+        {
+            constexpr int incpriX = 1;
+            constexpr int inccarX = 1;
+            ReproducibleFloatingAccumulator temp = *this;
+            if (primary(0) != 0.0)
+            {
+                const auto* const bins = binned_bins(binned_index());
+                for (int i = 0; i < FOLD; i++)
+                {
+                    temp.primary(i * incpriX) =
+                        bins[i] - (primary(i * incpriX) - bins[i]);
+                    temp.carry(i * inccarX) = -carry(i * inccarX);
+                }
+            }
+            return temp;
+        }
+
+        ///Convert this binned fp into its native floating-point representation
+        ftype conv() const
+        {
+            if (std::is_same_v<ftype, float>)
+            {
+                return binned_conv_single(1, 1);
+            }
+            else
+            {
+                return binned_conv_double(1, 1);
+            }
+        }
+
+        ///@brief Get binned fp summation error bound
+        ///
+        ///This is a bound on the absolute error of a summation using binned types
+        ///
+        ///@param N           The number of single precision floating point summands
+        ///@param max_abs_val The summand of maximum absolute value
+        ///@param binned_sum  The value of the sum computed using binned types
+        ///@return            The absolute error bound
+        static constexpr ftype error_bound(
+            const uint64_t N, const ftype max_abs_val, const ftype binned_sum)
+        {
+            const double X = std::abs(max_abs_val);
+            const double S = std::abs(binned_sum);
+            return static_cast<ftype>(max(X, ldexp(0.5, MIN_EXP - 1)) *
+                    ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
+                ((7.0 * EPSILON) /
+                    (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
+                        7.0 * EPSILON)) *
+                    S);
+        }
+
+        ///Add @p x to the binned fp
+        void add(const ftype x)
+        {
+            binned_dmdadd(x, 1, 1);
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///
+        ///@param first       Start of range
+        ///@param last        End of range
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last, const ftype max_abs_val)
+        {
+            binned_dmdupdate(std::abs(max_abs_val), 1, 1);
+            size_t count = 0;
+            size_t N = last - first;
+            for (; first != last; first++, count++)
+            {
+                binned_dmddeposit(static_cast<ftype>(*first), 1);
+                // first conditional allows compiler to remove the call here when possible
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///
+        ///NOTE: A maximum absolute value is calculated, so two passes are made over
+        ///      the data
+        ///
+        ///@param first       Start of range
+        ///@param last        End of range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last)
+        {
+            const auto max_abs_val = *std::max_element(
+                first, last, [](const auto& a, const auto& b) {
+                    return std::abs(a) < std::abs(b);
+                });
+            add(first, last, static_cast<ftype>(max_abs_val));
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void add(const T* input, const size_t N, const ftype max_abs_val)
+        {
+            if (N == 0)
+                return;
+            add(input, input + N, max_abs_val);
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///NOTE: A maximum absolute value is calculated, so two passes are made over
+        ///      the data
+        ///
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void add(const T* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            T max_abs_val = input[0];
+            for (size_t i = 0; i < N; i++)
+            {
+                max_abs_val = max(max_abs_val, std::abs(input[i]));
+            }
+            add(input, N, max_abs_val);
+        }
+
+        ///Accumulate a float4 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float4& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            binned_dmddeposit(static_cast<ftype>(x.z), 1);
+            binned_dmddeposit(static_cast<ftype>(x.w), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const double2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        void add(const float4* input, const size_t N, float max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const double2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const float2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const float4* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const double2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const float2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        //////////////////////////////////////
+        //MANUAL OPERATIONS; USE WISELY
+        //////////////////////////////////////
+
+        ///Rebins for repeated accumulation of scalars with magnitude <= @p mav
+        ///
+        ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator
+        ///with `unsafe_add` after which `renorm()` must be called. See the source of
+        ///`add()` for an example
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void set_max_abs_val(const T mav)
+        {
+            binned_dmdupdate(std::abs(mav), 1, 1);
+        }
+
+        ///Add @p x to the binned fp
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()`
+        void unsafe_add(const ftype x)
+        {
+            binned_dmddeposit(x, 1);
+        }
+
+        ///Renormalizes the binned fp
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()` and one or
+        ///more calls to `unsafe_add()`
+        void renorm()
+        {
+            binned_dmrenorm(1, 1);
+        }
+    };
+
+    
+}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
index 559ee830030e..76dc5fcd9806 100644
--- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
@@ -246,3 +246,7 @@ foreach(test ${tests})
     "modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS}
   )
 endforeach()
+
+target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address)
+
+target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address)
\ No newline at end of file
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index e2229a992fe4..462b1473f332 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -1,4 +1,4 @@
-//  Copyright (c) 2014-2020 Hartmut Kaiser
+//  Copyright (c) 2024 Shreyas Atre
 //
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
@@ -6,16 +6,19 @@
 
 #pragma once
 
+#include <hpx/init.hpp>
 #include <hpx/modules/testing.hpp>
+#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
+#include <hpx/parallel/algorithms/reduce.hpp>
 #include <hpx/parallel/algorithms/reduce_deterministic.hpp>
-#include <hpx/init.hpp>
 
+#include <algorithm>
 #include <cstddef>
 #include <iostream>
 #include <iterator>
+#include <limits>
 #include <numeric>
 #include <random>
-#include <string>
 #include <vector>
 
 #include "test_utils.hpp"
@@ -23,25 +26,147 @@
 int seed = std::random_device{}();
 std::mt19937 gen(seed);
 
+template <typename T>
+T get_rand(
+    T LO = std::numeric_limits<T>::min(), T HI = std::numeric_limits<T>::max())
+{
+    return LO +
+        static_cast<T>(std::rand()) / (static_cast<T>(RAND_MAX / (HI - LO)));
+}
+
 ///////////////////////////////////////////////////////////////////////////////
-template <typename IteratorTag>
+
+template <typename IteratorTag, typename FloatTypeDeterministic,
+    typename FloatTypeNonDeterministic, size_t LEN = 10007>
 void test_reduce1(IteratorTag)
 {
-    using base_iterator = std::vector<float>::iterator;
-    using iterator = test::test_iterator<base_iterator, IteratorTag>;
+    // check if different type for deterministic and nondeeterministic
+    // and same result i.e. correct computation
+    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
 
-    std::vector<float> c(10007);
-    std::iota(std::begin(c), std::end(c), gen());
+    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
 
-    float val(42);
-    auto op = [](auto v1, auto v2) { return v1 + v2; };
+    std::vector<FloatTypeDeterministic> deterministic(LEN);
+    std::vector<FloatTypeNonDeterministic> nondeterministic(LEN);
 
-    int r1 =
-        hpx::reduce_deterministic(iterator(std::begin(c)), iterator(std::end(c)), val, op);
+    std::iota(
+        deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0));
+
+    std::iota(nondeterministic.begin(), nondeterministic.end(),
+        FloatTypeNonDeterministic(0));
+
+    FloatTypeDeterministic val_det(0);
+    FloatTypeNonDeterministic val_non_det(0);
+    auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeDeterministic r1 =
+        hpx::reduce_deterministic(iterator_det(std::begin(deterministic)),
+            iterator_det(std::end(deterministic)), val_det, op);
 
     // verify values
-    int r2 = std::accumulate(std::begin(c), std::end(c), val, op);
-    HPX_TEST_EQ(r1, r2);
+    FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq,
+        iterator_ndet(std::begin(nondeterministic)),
+        iterator_ndet(std::end(nondeterministic)), val_non_det, op);
+
+    FloatTypeNonDeterministic r3 = std::accumulate(
+        nondeterministic.begin(), nondeterministic.end(), val_non_det);
+
+    HPX_TEST_EQ(r1, r3);
+    HPX_TEST_EQ(r2, r3);
+}
+
+template <typename IteratorTag, typename FloatTypeDeterministic,
+    size_t LEN = 10007>
+void test_reduce_determinism(IteratorTag)
+{
+    // check if different type for deterministic and nondeeterministic
+    // and same result
+    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
+
+    constexpr auto num_bounds_det =
+        std::is_same_v<FloatTypeDeterministic, float> ? 1000.0f : 1000000.0f;
+
+    std::vector<FloatTypeDeterministic> deterministic(LEN);
+
+    for (size_t i = 0; i < LEN; ++i)
+    {
+        deterministic[i] =
+            get_rand<FloatTypeDeterministic>(-num_bounds_det, num_bounds_det);
+    }
+
+    std::vector<FloatTypeDeterministic> deterministic_shuffled = deterministic;
+
+    std::shuffle(
+        deterministic_shuffled.begin(), deterministic_shuffled.end(), gen);
+
+    FloatTypeDeterministic val_det(41.999);
+
+    auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeDeterministic r1 =
+        hpx::reduce_deterministic(iterator_det(std::begin(deterministic)),
+            iterator_det(std::end(deterministic)), val_det, op);
+
+    FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic(
+        iterator_det(std::begin(deterministic_shuffled)),
+        iterator_det(std::end(deterministic_shuffled)), val_det, op);
+
+    HPX_TEST_EQ(r1,
+        r1_shuffled);    // Deterministically calculated, should always satisfy
+}
+
+template <typename IteratorTag, typename FloatTypeNonDeterministic,
+    size_t LEN = 10007>
+void test_orig_reduce_determinism(IteratorTag)
+{
+    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
+
+    constexpr auto num_bounds_ndet =
+        std::is_same_v<FloatTypeNonDeterministic, float> ? 1000.0f : 1000000.0f;
+
+    std::vector<FloatTypeNonDeterministic> nondeterministic(LEN);
+    for (size_t i = 0; i < LEN; ++i)
+    {
+        nondeterministic[i] = get_rand<FloatTypeNonDeterministic>(
+            -num_bounds_ndet, num_bounds_ndet);
+    }
+    std::vector<FloatTypeNonDeterministic> nondeterministic_shuffled =
+        nondeterministic;
+    std::shuffle(nondeterministic_shuffled.begin(),
+        nondeterministic_shuffled.end(), gen);
+
+    FloatTypeNonDeterministic val_non_det(41.999);
+
+    auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq,
+        iterator_ndet(std::begin(nondeterministic)),
+        iterator_ndet(std::end(nondeterministic)), val_non_det, op);
+    FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq,
+        iterator_ndet(std::begin(nondeterministic_shuffled)),
+        iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op);
+
+    FloatTypeNonDeterministic r3 = std::accumulate(
+        nondeterministic.begin(), nondeterministic.end(), val_non_det);
+    FloatTypeNonDeterministic r3_shuffled =
+        std::accumulate(nondeterministic_shuffled.begin(),
+            nondeterministic_shuffled.end(), val_non_det);
+
+    /// failed around 131 times out of 1000 on macOS arm
+    /// Floating point addition is not necessarily associative,
+    /// might fail on an architecture not yet known with much higher precision
+    HPX_TEST_NEQ(r2, r2_shuffled);
+    HPX_TEST_NEQ(r3, r3_shuffled);
 }
 
 template <typename IteratorTag>
@@ -49,25 +174,67 @@ void test_reduce1()
 {
     using namespace hpx::execution;
 
-    test_reduce1(IteratorTag());
+    test_reduce1<IteratorTag, float, float, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
+}
+
+template <typename IteratorTag>
+void test_reduce2()
+{
+    using namespace hpx::execution;
+
+    test_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
+    test_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
+}
+
+template <typename IteratorTag>
+void test_reduce3()
+{
+    using namespace hpx::execution;
+
+    test_orig_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
+    test_orig_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
 }
 
 void reduce_test1()
 {
     test_reduce1<std::random_access_iterator_tag>();
-    test_reduce1<std::forward_iterator_tag>();
+    test_reduce2<std::random_access_iterator_tag>();
+    test_reduce3<std::random_access_iterator_tag>();
+    // test_reduce1<std::forward_iterator_tag>();
 }
 
-
 ///////////////////////////////////////////////////////////////////////////////
 int hpx_main(hpx::program_options::variables_map& vm)
 {
     unsigned int seed = (unsigned int) std::time(nullptr);
+    bool seed_random = false;
+
     if (vm.count("seed"))
+    {
         seed = vm["seed"].as<unsigned int>();
+        seed_random = true;
+    }
+
+    if (vm.count("seed-random"))
+        seed_random = vm["seed-random"].as<bool>();
 
-    std::cout << "using seed: " << seed << std::endl;
-    gen.seed(seed);
+    if (seed_random)
+    {
+        std::cout << "using seed: " << seed << std::endl;
+        std::cout << "** std::accumulate, hpx::reduce may fail due to "
+                     "non-determinism of the floating summation"
+                  << std::endl;
+        gen.seed(seed);
+        std::srand(seed);
+    }
+    else
+    {
+        gen.seed(223);
+        std::srand(223);
+    }
 
     reduce_test1();
 
@@ -83,6 +250,9 @@ int main(int argc, char* argv[])
 
     desc_commandline.add_options()("seed,s", value<unsigned int>(),
         "the random number generator seed to use for this run");
+    desc_commandline.add_options()("seed-random", value<bool>(),
+        "switch for the random number generator seed to use for this run");
+
     // By default this test should run on all available cores
     std::vector<std::string> const cfg = {"hpx.os_threads=all"};
 

From 7f9a3c898bf81a6cc46e7e9a36dc58ac8de2f607 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sat, 7 Dec 2024 00:44:59 -0600
Subject: [PATCH 5/9] Remove unnecessary things from rfa

- Also perform renorm and update only when necessary

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           |   22 +-
 .../hpx/parallel/algorithms/detail/rfa.hpp    |  714 ++++++----
 .../parallel/algorithms/detail/rfa_cuda.hpp   | 1168 -----------------
 .../unit/algorithms/reduce_deterministic.cpp  |    2 +-
 4 files changed, 512 insertions(+), 1394 deletions(-)
 delete mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index acf0cc687f56..76f093837def 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -9,7 +9,7 @@
 #include <hpx/config.hpp>
 #include <hpx/functional/detail/tag_fallback_invoke.hpp>
 #include <hpx/functional/invoke.hpp>
-#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
 #include <hpx/parallel/util/loop.hpp>
 
 #include <cstddef>
@@ -34,19 +34,29 @@ namespace hpx::parallel::detail {
         {
             hpx::parallel::detail::rfa::RFA_bins<T> bins;
             bins.initialize_bins();
-            memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
 
             hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
             rfa.set_max_abs_val(init);
             rfa.unsafe_add(init);
             rfa.renorm();
-            // rfa.add(first, last);
+            size_t count = 0;
+            T max_val = std::abs(*first);
             for (auto e = first; e != last; ++e)
             {
-                // printf("%f \n",*e);
-                rfa.set_max_abs_val(*e);
+                T temp_max_val = std::abs(static_cast<T>(*e));
+                if (max_val < temp_max_val)
+                {
+                    rfa.set_max_abs_val(temp_max_val);
+                    max_val = temp_max_val;
+                }
                 rfa.unsafe_add(*e);
-                rfa.renorm();
+                count++;
+                if (count == rfa.endurance())
+                {
+                    rfa.renorm();
+                    count = 0;
+                }
             }
             return rfa.conv();
         }
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index 340f4cc7b2ef..4a9910cb6faa 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -27,7 +27,73 @@
 #include <cstdint>
 #include <limits>
 
-namespace RFA {
+namespace hpx::parallel::detail::rfa {
+    template <typename F>
+    struct type4
+    {
+        F x;
+        F y;
+        F z;
+        F w;
+    };
+
+    template <typename F>
+    struct type2
+    {
+        F x;
+        F y;
+    };
+    using float4 = type4<float>;
+    using double4 = type4<double>;
+    using float2 = type2<float>;
+    using double2 = type2<double>;
+
+    auto abs_max(float4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<float> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<double> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(float2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<float> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<double> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+// disable zero checks
+#define DISABLE_ZERO
+
+// disable nan / infinity checks
+#define DISABLE_NANINF
+
+// jump table for indexing into data
+#define MAX_JUMP 5
+    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
+
     template <typename Real>
     inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
     {
@@ -50,21 +116,74 @@ namespace RFA {
         // return arg;
     }
 
+    template <class ftype>
+    struct RFA_bins
+    {
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+
+        ///The binned floating-point reference bins
+        std::array<ftype, MAXINDEX + MAXFOLD> bins = {};
+
+        constexpr ftype& operator[](int d)
+        {
+            return bins[d];
+        }
+
+        void initialize_bins()
+        {
+            if constexpr (std::is_same_v<ftype, float>)
+            {
+                bins[0] = std::ldexp(0.75, MAX_EXP);
+            }
+            else
+            {
+                bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1);
+            }
+
+            for (int index = 1; index <= MAXINDEX; index++)
+            {
+                bins[index] = std::ldexp(0.75,
+                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
+            }
+            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
+            {
+                bins[index] = bins[index - 1];
+            }
+        }
+    };
+
+    static char bin_host_buffer[sizeof(RFA_bins<double>)];
+
     ///Class to hold a reproducible summation of the numbers passed to it
     ///
-    ///@param ftype Floating-point data type; either `float` or `double`
+    ///@param ftype Floating-point data type; either `float` or `double
     ///@param FOLD  The fold; use 3 as a default unless you understand it.
-    template <class ftype, int FOLD = 3,
-        typename std::enable_if<std::is_floating_point<ftype>::value>::type* =
+    template <class ftype_, int FOLD_ = 3,
+        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
             nullptr>
-    class ReproducibleFloatingAccumulator
+    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
     {
     public:
+        using ftype = ftype_;
+        static constexpr int FOLD = FOLD_;
+
+    private:
         std::array<ftype, 2 * FOLD> data = {0};
 
         ///Floating-point precision bin width
         static constexpr auto BIN_WIDTH =
-            std::is_same<ftype, double>::value ? 40 : 13;
+            std::is_same_v<ftype, double> ? 40 : 13;
         static constexpr auto MIN_EXP =
             std::numeric_limits<ftype>::min_exponent;
         static constexpr auto MAX_EXP =
@@ -91,43 +210,11 @@ namespace RFA {
         ///The number of deposits that can be performed before a renorm is necessary.
         ///Applies also to binned complex double precision.
         static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
-        // static_assert(ENDURANCE == 0);
-    public:
-        ///Generates binned floating-point reference bins
-        static constexpr std::array<ftype, MAXINDEX + MAXFOLD> initialize_bins()
-        {
-            std::array<ftype, MAXINDEX + MAXFOLD> bins{0};
 
-            if (std::is_same<ftype, float>::value)
-            {
-                bins[0] = ldexp_impl(0.75, MAX_EXP);
-            }
-            else
-            {
-                bins[0] = 2.0 * ldexp_impl(0.75, MAX_EXP - 1);
-            }
-
-            for (int index = 1; index <= MAXINDEX; index++)
-            {
-                bins[index] = ldexp_impl(0.75,
-                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
-            }
-            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
-            {
-                bins[index] = bins[index - 1];
-            }
-
-            return bins;
-        }
-
-        ///The binned floating-point reference bins
-        static std::array<ftype, MAXINDEX + MAXFOLD> bins;
-
-    private:
         ///Return a binned floating-point reference bin
-        static inline constexpr const ftype* binned_bins(const int x)
+        inline const ftype* binned_bins(const int x) const
         {
-            return &bins[x];
+            return &reinterpret_cast<RFA_bins<ftype>&>(bin_host_buffer)[x];
         }
 
         ///Get the bit representation of a float
@@ -151,33 +238,109 @@ namespace RFA {
             return *reinterpret_cast<const uint64_t*>(&x);
         }
 
-        ///Return a pointer to the primary vector
-        inline ftype* pvec()
+        ///Return primary vector value const ref
+        inline const ftype& primary(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[4];
+                default:
+                    return data[FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[i];
+            }
+        }
+
+        ///Return carry vector value const ref
+        inline const ftype& carry(int i) const
         {
-            return &data[0];
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[FOLD + 0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[FOLD + 1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[FOLD + 2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[FOLD + 3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[FOLD + 4];
+                default:
+                    return data[2 * FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[FOLD + i];
+            }
         }
-        ///Return a pointer to the carry vector
-        inline ftype* cvec()
+
+        ///Return primary vector value ref
+        inline ftype& primary(int i)
         {
-            return &data[FOLD];
+            const auto& c = *this;
+            return const_cast<ftype&>(c.primary(i));
         }
-        ///Return a const pointer to the primary vector
-        inline const ftype* pvec() const
+
+        ///Return carry vector value ref
+        inline ftype& carry(int i)
         {
-            return &data[0];
+            const auto& c = *this;
+            return const_cast<ftype&>(c.carry(i));
         }
-        ///Return a const pointer to the carry vector
-        inline const ftype* cvec() const
+
+#ifdef DISABLE_ZERO
+        static inline constexpr bool ISZERO(const ftype)
         {
-            return &data[FOLD];
+            return false;
         }
+#else
+        static inline constexpr bool ISZERO(const ftype x)
+        {
+            return x == 0.0;
+        }
+#endif
 
+#ifdef DISABLE_NANINF
+        static inline constexpr int ISNANINF(const ftype)
+        {
+            return false;
+        }
+#else
         static inline constexpr int ISNANINF(const ftype x)
         {
             const auto bits = get_bits(x);
             return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
                 ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
         }
+#endif
 
         static inline constexpr int EXP(const ftype x)
         {
@@ -200,8 +363,8 @@ namespace RFA {
                 }
                 else
                 {
-                    frexp(x, &exp);
-                    return std::min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                    std::frexp(x, &exp);
+                    return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
                 }
             }
             return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
@@ -213,7 +376,7 @@ namespace RFA {
         inline int binned_index() const
         {
             return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
-                       EXP(pvec()[0])) /
+                       EXP(primary(0))) /
                 BIN_WIDTH;
         }
 
@@ -221,7 +384,7 @@ namespace RFA {
         ///A quick check to determine if the index is 0
         inline bool binned_index0() const
         {
-            return EXP(pvec()[0]) == MAX_EXP + EXP_BIAS;
+            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
         }
 
         ///Update manually specified binned fp with a scalar (X -> Y)
@@ -234,43 +397,40 @@ namespace RFA {
         void binned_dmdupdate(
             const ftype max_abs_val, const int incpriY, const int inccarY)
         {
-            int i;
-            int j;
-            int X_index;
-            int shift;
-            auto* const priY = pvec();
-            auto* const carY = cvec();
-
-            if (ISNANINF(priY[0]))
-            {
+            if (ISNANINF(primary(0)))
                 return;
-            }
 
-            X_index = binned_dindex(max_abs_val);
-            if (priY[0] == 0.0)
+            int X_index = binned_dindex(max_abs_val);
+            if (ISZERO(primary(0)))
             {
                 const ftype* const bins = binned_bins(X_index);
-                for (i = 0; i < FOLD; i++)
+                for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] = bins[i];
-                    carY[i * inccarY] = 0.0;
+                    primary(i * incpriY) = bins[i];
+                    carry(i * inccarY) = 0.0;
                 }
             }
             else
             {
-                shift = binned_index() - X_index;
+                int shift = binned_index() - X_index;
                 if (shift > 0)
                 {
-                    for (i = FOLD - 1; i >= shift; i--)
+#pragma unroll
+                    for (int i = FOLD - 1; i >= 1; i--)
                     {
-                        priY[i * incpriY] = priY[(i - shift) * incpriY];
-                        carY[i * inccarY] = carY[(i - shift) * inccarY];
+                        if (i < shift)
+                            break;
+                        primary(i * incpriY) = primary((i - shift) * incpriY);
+                        carry(i * inccarY) = carry((i - shift) * inccarY);
                     }
                     const ftype* const bins = binned_bins(X_index);
-                    for (j = 0; j < i + 1; j++)
+#pragma unroll
+                    for (int j = 0; j < FOLD; j++)
                     {
-                        priY[j * incpriY] = bins[j];
-                        carY[j * inccarY] = 0.0;
+                        if (j >= shift)
+                            break;
+                        primary(j * incpriY) = bins[j];
+                        carry(j * inccarY) = 0.0;
                     }
                 }
             }
@@ -285,59 +445,59 @@ namespace RFA {
         void binned_dmddeposit(const ftype X, const int incpriY)
         {
             ftype M;
-            int i;
             ftype x = X;
-            auto* const priY = pvec();
 
-            if (ISNANINF(x) || ISNANINF(priY[0]))
+            if (ISNANINF(x) || ISNANINF(primary(0)))
             {
-                priY[0] += x;
+                primary(0) += x;
                 return;
             }
 
             if (binned_index0())
             {
-                M = priY[0];
+                M = primary(0);
                 ftype qd = x * COMPRESSION;
                 auto& ql = get_bits(qd);
                 ql |= 1;
                 qd += M;
-                priY[0] = qd;
+                primary(0) = qd;
                 M -= qd;
                 M *= EXPANSION * 0.5;
                 x += M;
                 x += M;
-                for (i = 1; i < FOLD - 1; i++)
+#pragma unroll
+                for (int i = 1; i < FOLD - 1; i++)
                 {
-                    M = priY[i * incpriY];
+                    M = primary(i * incpriY);
                     qd = x;
                     ql |= 1;
                     qd += M;
-                    priY[i * incpriY] = qd;
+                    primary(i * incpriY) = qd;
                     M -= qd;
                     x += M;
                 }
                 qd = x;
                 ql |= 1;
-                priY[i * incpriY] += qd;
+                primary((FOLD - 1) * incpriY) += qd;
             }
             else
             {
                 ftype qd = x;
                 auto& ql = get_bits(qd);
-                for (i = 0; i < FOLD - 1; i++)
+#pragma unroll
+                for (int i = 0; i < FOLD - 1; i++)
                 {
-                    M = priY[i * incpriY];
+                    M = primary(i * incpriY);
                     qd = x;
                     ql |= 1;
                     qd += M;
-                    priY[i * incpriY] = qd;
+                    primary(i * incpriY) = qd;
                     M -= qd;
                     x += M;
                 }
                 qd = x;
                 ql |= 1;
-                priY[i * incpriY] += qd;
+                primary((FOLD - 1) * incpriY) += qd;
             }
         }
 
@@ -348,26 +508,22 @@ namespace RFA {
         ///
         ///@param incpriX stride within X's primary vector (use every incpriX'th element)
         ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        void binned_dmrenorm(const int incpriX, const int inccarX)
+        inline void binned_dmrenorm(const int incpriX, const int inccarX)
         {
-            auto* priX = pvec();
-            auto* carX = cvec();
-
-            if (priX[0] == 0.0 || ISNANINF(priX[0]))
-            {
+            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
                 return;
-            }
 
-            for (int i = 0; i < FOLD; i++, priX += incpriX, carX += inccarX)
+            for (int i = 0; i < FOLD; i++)
             {
-                auto tmp_renormd = priX[0];
+                auto tmp_renormd = primary(i * incpriX);
                 auto& tmp_renorml = get_bits(tmp_renormd);
 
-                carX[0] += (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
+                carry(i * inccarX) +=
+                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
 
                 tmp_renorml &= ~(1ull << (MANT_DIG - 3));
                 tmp_renorml |= 1ull << (MANT_DIG - 2);
-                priX[0] = tmp_renormd;
+                primary(i * incpriX) = tmp_renormd;
             }
         }
 
@@ -392,18 +548,10 @@ namespace RFA {
         {
             int i = 0;
 
-            const auto* const priX = pvec();
-            const auto* const carX = cvec();
-
-            if (ISNANINF(priX[0]))
-            {
-                return priX[0];
-            }
-
-            if (priX[0] == 0.0)
-            {
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
                 return 0.0;
-            }
 
             double Y = 0.0;
             double scale_down;
@@ -413,30 +561,31 @@ namespace RFA {
             const auto* const bins = binned_bins(X_index);
             if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
             {
-                scale_down = ldexp_impl(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
-                scale_up = ldexp_impl(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
+                scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
+                scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
                 scaled = std::max(
                     std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
                 if (X_index == 0)
                 {
-                    Y += carX[0] * ((bins[0] / 6.0) * scale_down * EXPANSION);
-                    Y += carX[inccarX] * ((bins[1] / 6.0) * scale_down);
-                    Y += (priX[0] - bins[0]) * scale_down * EXPANSION;
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
+                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
+                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
                     i = 2;
                 }
                 else
                 {
-                    Y += carX[0] * ((bins[0] / 6.0) * scale_down);
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
                     i = 1;
                 }
                 for (; i < scaled; i++)
                 {
-                    Y += carX[i * inccarX] * ((bins[i] / 6.0) * scale_down);
-                    Y += (priX[(i - 1) * incpriX] - bins[i - 1]) * scale_down;
+                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
+                    Y +=
+                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
                 }
                 if (i == FOLD)
                 {
-                    Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]) *
+                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
                         scale_down;
                     return Y * scale_up;
                 }
@@ -447,20 +596,20 @@ namespace RFA {
                 Y *= scale_up;
                 for (; i < FOLD; i++)
                 {
-                    Y += carX[i * inccarX] * (bins[i] / 6.0);
-                    Y += priX[(i - 1) * incpriX] - bins[i - 1];
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += primary((i - 1) * incpriX) - bins[i - 1];
                 }
-                Y += priX[(FOLD - 1) * incpriX] - bins[FOLD - 1];
+                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
             }
             else
             {
-                Y += carX[0] * (bins[0] / 6.0);
+                Y += carry(0) * (bins[0] / 6.0);
                 for (i = 1; i < FOLD; i++)
                 {
-                    Y += carX[i * inccarX] * (bins[i] / 6.0);
-                    Y += (priX[(i - 1) * incpriX] - bins[i - 1]);
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
                 }
-                Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]);
+                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
             }
             return Y;
         }
@@ -473,18 +622,11 @@ namespace RFA {
         {
             int i = 0;
             double Y = 0.0;
-            const auto* const priX = pvec();
-            const auto* const carX = cvec();
-
-            if (ISNANINF(priX[0]))
-            {
-                return priX[0];
-            }
 
-            if (priX[0] == 0.0)
-            {
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
                 return 0.0;
-            }
 
             //Note that the following order of summation is in order of decreasing
             //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
@@ -493,23 +635,23 @@ namespace RFA {
             const auto* const bins = binned_bins(X_index);
             if (X_index == 0)
             {
-                Y += (double) carX[0] * (double) (bins[0] / 6.0) *
+                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
                     (double) EXPANSION;
-                Y += (double) carX[inccarX] * (double) (bins[1] / 6.0);
-                Y += (double) (priX[0] - bins[0]) * (double) EXPANSION;
+                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
                 i = 2;
             }
             else
             {
-                Y += (double) carX[0] * (double) (bins[0] / 6.0);
+                Y += (double) carry(0) * (double) (bins[0] / 6.0);
                 i = 1;
             }
             for (; i < FOLD; i++)
             {
-                Y += (double) carX[i * inccarX] * (double) (bins[i] / 6.0);
-                Y += (double) (priX[(i - 1) * incpriX] - bins[i - 1]);
+                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
             }
-            Y += (double) (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]);
+            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
 
             return (float) Y;
         }
@@ -522,64 +664,78 @@ namespace RFA {
         ///@param inccarX stride within X's carry vector (use every inccarX'th element)
         ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
         ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdmadd(const ReproducibleFloatingAccumulator& other,
+        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
             const int incpriX, const int inccarX, const int incpriY,
             const int inccarY)
         {
-            auto* const priX = other.pvec();
-            auto* const carX = other.cvec();
-            auto* const priY = pvec();
-            auto* const carY = cvec();
-
-            if (priX[0] == 0.0)
+            if (ISZERO(x.primary(0)))
                 return;
 
-            if (priY[0] == 0.0)
+            if (ISZERO(primary(0)))
             {
                 for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] = priX[i * incpriX];
-                    carY[i * inccarY] = carX[i * inccarX];
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
                 }
                 return;
             }
 
-            if (ISNANINF(priX[0]) || ISNANINF(priY[0]))
+            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
             {
-                priY[0] += priX[0];
+                primary(0) += x.primary(0);
                 return;
             }
 
-            // const auto X_index = binned_index(priX);
-            const auto X_index = other.binned_index();
-            const auto Y_index = binned_index();
+            const auto X_index = x.binned_index();
+            const auto Y_index = this->binned_index();
             const auto shift = Y_index - X_index;
             if (shift > 0)
             {
                 const auto* const bins = binned_bins(Y_index);
                 //shift Y upwards and add X to Y
-                for (int i = FOLD - 1; i >= shift; i--)
+#pragma unroll
+                for (int i = FOLD - 1; i >= 1; i--)
                 {
-                    priY[i * incpriY] = priX[i * incpriX] +
-                        (priY[(i - shift) * incpriY] - bins[i - shift]);
-                    carY[i * inccarY] =
-                        carX[i * inccarX] + carY[(i - shift) * inccarY];
+                    if (i < shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX) +
+                        (primary((i - shift) * incpriY) - bins[i - shift]);
+                    carry(i * inccarY) =
+                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
                 }
-                for (int i = 0; i < shift && i < FOLD; i++)
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] = priX[i * incpriX];
-                    carY[i * inccarY] = carX[i * inccarX];
+                    if (i == shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
                 }
             }
-            else
+            else if (shift < 0)
             {
                 const auto* const bins = binned_bins(X_index);
                 //shift X upwards and add X to Y
-                for (int i = 0 - shift; i < FOLD; i++)
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
                 {
-                    priY[i * incpriY] +=
-                        priX[(i + shift) * incpriX] - bins[i + shift];
-                    carY[i * inccarY] += carX[(i + shift) * inccarX];
+                    if (i < -shift)
+                        continue;
+                    primary(i * incpriY) +=
+                        x.primary((i + shift) * incpriX) - bins[i + shift];
+                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
+                }
+            }
+            else if (shift == 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                // add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
+                    carry(i * inccarY) += x.carry(i * inccarX);
                 }
             }
 
@@ -594,6 +750,13 @@ namespace RFA {
         }
 
     public:
+        ReproducibleFloatingAccumulator() = default;
+        ReproducibleFloatingAccumulator(
+            const ReproducibleFloatingAccumulator&) = default;
+        ///Sets this binned fp equal to another binned fp
+        ReproducibleFloatingAccumulator& operator=(
+            const ReproducibleFloatingAccumulator&) = default;
+
         ///Set the binned fp to zero
         void zero()
         {
@@ -601,22 +764,27 @@ namespace RFA {
         }
 
         ///Return the fold of the binned fp
-        int fold() const
+        constexpr int fold() const
         {
             return FOLD;
         }
 
+        ///Return the endurance of the binned fp
+        constexpr int endurance() const
+        {
+            return ENDURANCE;
+        }
+
         ///Returns the number of reference bins. Used for judging memory usage.
-        static constexpr size_t number_of_reference_bins()
+        constexpr size_t number_of_reference_bins()
         {
-            return bins.size();
+            return std::array<ftype, MAXINDEX + MAXFOLD>::size();
         }
 
         ///Accumulate an arithmetic @p x into the binned fp.
         ///NOTE: Casts @p x to the type of the binned fp
         template <typename U,
-            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
         ReproducibleFloatingAccumulator& operator+=(const U x)
         {
             binned_dmdadd(static_cast<ftype>(x), 1, 1);
@@ -626,8 +794,7 @@ namespace RFA {
         ///Accumulate-subtract an arithmetic @p x into the binned fp.
         ///NOTE: Casts @p x to the type of the binned fp
         template <typename U,
-            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
         ReproducibleFloatingAccumulator& operator-=(const U x)
         {
             binned_dmdadd(-static_cast<ftype>(x), 1, 1);
@@ -666,8 +833,7 @@ namespace RFA {
         ///Sets this binned fp equal to the arithmetic value @p x
         ///NOTE: Casts @p x to the type of the binned fp
         template <typename U,
-            typename std::enable_if<std::is_arithmetic<U>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
         ReproducibleFloatingAccumulator& operator=(const U x)
         {
             zero();
@@ -675,14 +841,6 @@ namespace RFA {
             return *this;
         }
 
-        ///Sets this binned fp equal to another binned fp
-        ReproducibleFloatingAccumulator& operator=(
-            const ReproducibleFloatingAccumulator<ftype, FOLD>& o)
-        {
-            data = o.data;
-            return *this;
-        }
-
         ///Returns the negative of this binned fp
         ///NOTE: Makes a copy and performs arithmetic; slow.
         ReproducibleFloatingAccumulator operator-()
@@ -690,14 +848,14 @@ namespace RFA {
             constexpr int incpriX = 1;
             constexpr int inccarX = 1;
             ReproducibleFloatingAccumulator temp = *this;
-            if (pvec()[0] != 0.0)
+            if (primary(0) != 0.0)
             {
                 const auto* const bins = binned_bins(binned_index());
                 for (int i = 0; i < FOLD; i++)
                 {
-                    temp.pvec()[i * incpriX] =
-                        bins[i] - (pvec()[i * incpriX] - bins[i]);
-                    temp.cvec()[i * inccarX] = -cvec()[i * inccarX];
+                    temp.primary(i * incpriX) =
+                        bins[i] - (primary(i * incpriX) - bins[i]);
+                    temp.carry(i * inccarX) = -carry(i * inccarX);
                 }
             }
             return temp;
@@ -706,7 +864,7 @@ namespace RFA {
         ///Convert this binned fp into its native floating-point representation
         ftype conv() const
         {
-            if (std::is_same<ftype, float>::value)
+            if (std::is_same_v<ftype, float>)
             {
                 return binned_conv_single(1, 1);
             }
@@ -729,9 +887,8 @@ namespace RFA {
         {
             const double X = std::abs(max_abs_val);
             const double S = std::abs(binned_sum);
-            return static_cast<ftype>(
-                std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) *
-                    ldexp_impl(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
+            return static_cast<ftype>(max(X, std::ldexp(0.5, MIN_EXP - 1)) *
+                    std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
                 ((7.0 * EPSILON) /
                     (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
                         7.0 * EPSILON)) *
@@ -754,10 +911,12 @@ namespace RFA {
         {
             binned_dmdupdate(std::abs(max_abs_val), 1, 1);
             size_t count = 0;
+            size_t N = last - first;
             for (; first != last; first++, count++)
             {
                 binned_dmddeposit(static_cast<ftype>(*first), 1);
-                if (count == ENDURANCE)
+                // first conditional allows compiler to remove the call here when possible
+                if (N > ENDURANCE && count == ENDURANCE)
                 {
                     binned_dmrenorm(1, 1);
                     count = 0;
@@ -788,14 +947,11 @@ namespace RFA {
         ///@param N           Number of elements to add
         ///@param max_abs_val Maximum absolute value of any member of the range
         template <typename T,
-            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
         void add(const T* input, const size_t N, const ftype max_abs_val)
         {
             if (N == 0)
-            {
                 return;
-            }
             add(input, input + N, max_abs_val);
         }
 
@@ -807,19 +963,147 @@ namespace RFA {
         ///@param input       Start of the range
         ///@param N           Number of elements to add
         template <typename T,
-            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
         void add(const T* input, const size_t N)
         {
             if (N == 0)
+                return;
+
+            T max_abs_val = input[0];
+            for (size_t i = 0; i < N; i++)
+            {
+                max_abs_val = max(max_abs_val, std::abs(input[i]));
+            }
+            add(input, N, max_abs_val);
+        }
+
+        ///Accumulate a float4 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float4& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            binned_dmddeposit(static_cast<ftype>(x.z), 1);
+            binned_dmddeposit(static_cast<ftype>(x.w), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const double2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        void add(const float4* input, const size_t N, float max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
             {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const double2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
                 return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
             }
-            T max_abs_val = input[0];
+        }
+
+        void add(const float2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
             for (size_t i = 0; i < N; i++)
             {
-                max_abs_val = std::max(max_abs_val, std::abs(input[i]));
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
             }
+        }
+
+        void add(const float4* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const double2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const float2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
             add(input, N, max_abs_val);
         }
 
@@ -833,8 +1117,7 @@ namespace RFA {
         ///with `unsafe_add` after which `renorm()` must be called. See the source of
         ///`add()` for an example
         template <typename T,
-            typename std::enable_if<std::is_arithmetic<T>::value>::type* =
-                nullptr>
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
         void set_max_abs_val(const T mav)
         {
             binned_dmdupdate(std::abs(mav), 1, 1);
@@ -858,11 +1141,4 @@ namespace RFA {
         }
     };
 
-    template <>
-    auto ReproducibleFloatingAccumulator<float>::bins =
-        ReproducibleFloatingAccumulator<float>::initialize_bins();
-    
-    template <>
-    auto ReproducibleFloatingAccumulator<double>::bins =
-        ReproducibleFloatingAccumulator<double>::initialize_bins();
-}    // namespace RFA
+}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
deleted file mode 100644
index 05f71d9ae746..000000000000
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
+++ /dev/null
@@ -1,1168 +0,0 @@
-//Reproducible Floating Point Accumulations via Binned Floating Point
-//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
-//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
-//
-//The code accomplishes several objectives:
-//
-//1. Reproducible summation, independent of summation order, assuming only a
-//   subset of the IEEE 754 Floating Point Standard
-//
-//2. Has accuracy at least as good as conventional summation, and tunable
-//
-//3. Handles overflow, underflow, and other exceptions reproducibly.
-//
-//4. Makes only one read-only pass over the summands.
-//
-//5. Requires only one parallel reduction.
-//
-//6. Uses minimal memory (6 doubles per accumulator with fold=3).
-//
-//7. Relatively easy to use
-
-#pragma once
-
-#include <algorithm>
-#include <climits>
-#include <cmath>
-#include <cstdint>
-#include <limits>
-
-#ifndef __CUDACC__
-#define __host__
-#define __device__
-#define __forceinline__
-#include <array>
-using std::array;
-using std::max;
-using std::min;
-#else
-#include <cuda/std/array>
-using cuda::std::array;
-using cuda::std::max;
-using cuda::std::min;
-#include "vector.hpp"
-#endif
-
-namespace hpx::parallel::detail::rfa {
-    template <typename F>
-    struct type4
-    {
-        F x;
-        F y;
-        F z;
-        F w;
-    };
-
-    template <typename F>
-    struct type2
-    {
-        F x;
-        F y;
-    };
-    using float4 = type4<float>;
-    using double4 = type4<double>;
-    using float2 = type2<float>;
-    using double2 = type2<double>;
-
-    auto abs_max(float4 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        auto z = std::abs(a.z);
-        auto w = std::abs(a.w);
-        const std::vector<float> v = {x, y, z, w};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(double4 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        auto z = std::abs(a.z);
-        auto w = std::abs(a.w);
-        const std::vector<double> v = {x, y, z, w};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(float2 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        const std::vector<float> v = {x, y};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(double2 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        const std::vector<double> v = {x, y};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-// disable zero checks
-#define DISABLE_ZERO
-
-// disable nan / infinity checks
-#define DISABLE_NANINF
-
-// jump table for indexing into data
-#define MAX_JUMP 5
-    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
-
-    template <typename Real>
-    inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
-    {
-        return std::ldexp(arg, exp);
-        // while (arg == 0)
-        // {
-        //     return arg;
-        // }
-        // while (exp > 0)
-        // {
-        //     arg *= static_cast<Real>(2);
-        //     --exp;
-        // }
-        // while (exp < 0)
-        // {
-        //     arg /= static_cast<Real>(2);
-        //     ++exp;
-        // }
-
-        // return arg;
-    }
-
-    template <class ftype>
-    struct RFA_bins
-    {
-        static constexpr auto BIN_WIDTH =
-            std::is_same_v<ftype, double> ? 40 : 13;
-        static constexpr auto MIN_EXP =
-            std::numeric_limits<ftype>::min_exponent;
-        static constexpr auto MAX_EXP =
-            std::numeric_limits<ftype>::max_exponent;
-        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
-        ///Binned floating-point maximum index
-        static constexpr auto MAXINDEX =
-            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
-        //The maximum floating-point fold supported by the library
-        static constexpr auto MAXFOLD = MAXINDEX + 1;
-
-        ///The binned floating-point reference bins
-        array<ftype, MAXINDEX + MAXFOLD> bins = {};
-
-        constexpr ftype& operator[](int d)
-        {
-            return bins[d];
-        }
-
-        void initialize_bins()
-        {
-            if constexpr (std::is_same_v<ftype, float>)
-            {
-                bins[0] = std::ldexp(0.75, MAX_EXP);
-            }
-            else
-            {
-                bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1);
-            }
-
-            for (int index = 1; index <= MAXINDEX; index++)
-            {
-                bins[index] = ldexp(0.75,
-                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
-            }
-            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
-            {
-                bins[index] = bins[index - 1];
-            }
-        }
-    };
-
-    static char bin_host_buffer[sizeof(RFA_bins<double>)];
-#ifdef __CUDACC__
-    __constant__ static char bin_device_buffer[sizeof(RFA_bins<double>)];
-#endif
-
-    ///Class to hold a reproducible summation of the numbers passed to it
-    ///
-    ///@param ftype Floating-point data type; either `float` or `double
-    ///@param FOLD  The fold; use 3 as a default unless you understand it.
-    template <class ftype_, int FOLD_ = 3,
-        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
-            nullptr>
-    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
-    {
-    public:
-        using ftype = ftype_;
-        static constexpr int FOLD = FOLD_;
-
-    private:
-        array<ftype, 2 * FOLD> data = {0};
-
-        ///Floating-point precision bin width
-        static constexpr auto BIN_WIDTH =
-            std::is_same_v<ftype, double> ? 40 : 13;
-        static constexpr auto MIN_EXP =
-            std::numeric_limits<ftype>::min_exponent;
-        static constexpr auto MAX_EXP =
-            std::numeric_limits<ftype>::max_exponent;
-        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
-        ///Binned floating-point maximum index
-        static constexpr auto MAXINDEX =
-            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
-        //The maximum floating-point fold supported by the library
-        static constexpr auto MAXFOLD = MAXINDEX + 1;
-        ///Binned floating-point compression factor
-        ///This factor is used to scale down inputs before deposition into the bin of
-        ///highest index
-        static constexpr auto COMPRESSION =
-            1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1));
-        ///Binned double precision expansion factor
-        ///This factor is used to scale up inputs after deposition into the bin of
-        ///highest index
-        static constexpr auto EXPANSION =
-            1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1));
-        static constexpr auto EXP_BIAS = MAX_EXP - 2;
-        static constexpr auto EPSILON = std::numeric_limits<ftype>::epsilon();
-        ///Binned floating-point deposit endurance
-        ///The number of deposits that can be performed before a renorm is necessary.
-        ///Applies also to binned complex double precision.
-        static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
-
-        ///Return a binned floating-point reference bin
-        inline const ftype* binned_bins(const int x) const
-        {
-#ifdef __CUDA_ARCH__    // must be arch not CC here
-            return &reinterpret_cast<RFA_bins<ftype>&>(bin_device_buffer)[x];
-#else
-            return &reinterpret_cast<RFA_bins<ftype>&>(bin_host_buffer)[x];
-#endif
-        }
-
-        ///Get the bit representation of a float
-        static inline uint32_t& get_bits(float& x)
-        {
-            return *reinterpret_cast<uint32_t*>(&x);
-        }
-        ///Get the bit representation of a double
-        static inline uint64_t& get_bits(double& x)
-        {
-            return *reinterpret_cast<uint64_t*>(&x);
-        }
-        ///Get the bit representation of a const float
-        static inline uint32_t get_bits(const float& x)
-        {
-            return *reinterpret_cast<const uint32_t*>(&x);
-        }
-        ///Get the bit representation of a const double
-        static inline uint64_t get_bits(const double& x)
-        {
-            return *reinterpret_cast<const uint64_t*>(&x);
-        }
-
-        ///Return primary vector value const ref
-        inline const ftype& primary(int i) const
-        {
-            if constexpr (FOLD <= MAX_JUMP)
-            {
-                switch (i)
-                {
-                case 0:
-                    if constexpr (FOLD >= 1)
-                        return data[0];
-                case 1:
-                    if constexpr (FOLD >= 2)
-                        return data[1];
-                case 2:
-                    if constexpr (FOLD >= 3)
-                        return data[2];
-                case 3:
-                    if constexpr (FOLD >= 4)
-                        return data[3];
-                case 4:
-                    if constexpr (FOLD >= 5)
-                        return data[4];
-                default:
-                    return data[FOLD - 1];
-                }
-            }
-            else
-            {
-                return data[i];
-            }
-        }
-
-        ///Return carry vector value const ref
-        inline const ftype& carry(int i) const
-        {
-            if constexpr (FOLD <= MAX_JUMP)
-            {
-                switch (i)
-                {
-                case 0:
-                    if constexpr (FOLD >= 1)
-                        return data[FOLD + 0];
-                case 1:
-                    if constexpr (FOLD >= 2)
-                        return data[FOLD + 1];
-                case 2:
-                    if constexpr (FOLD >= 3)
-                        return data[FOLD + 2];
-                case 3:
-                    if constexpr (FOLD >= 4)
-                        return data[FOLD + 3];
-                case 4:
-                    if constexpr (FOLD >= 5)
-                        return data[FOLD + 4];
-                default:
-                    return data[2 * FOLD - 1];
-                }
-            }
-            else
-            {
-                return data[FOLD + i];
-            }
-        }
-
-        ///Return primary vector value ref
-        inline ftype& primary(int i)
-        {
-            const auto& c = *this;
-            return const_cast<ftype&>(c.primary(i));
-        }
-
-        ///Return carry vector value ref
-        inline ftype& carry(int i)
-        {
-            const auto& c = *this;
-            return const_cast<ftype&>(c.carry(i));
-        }
-
-#ifdef DISABLE_ZERO
-        static inline constexpr bool ISZERO(const ftype)
-        {
-            return false;
-        }
-#else
-        static inline constexpr bool ISZERO(const ftype x)
-        {
-            return x == 0.0;
-        }
-#endif
-
-#ifdef DISABLE_NANINF
-        static inline constexpr int ISNANINF(const ftype)
-        {
-            return false;
-        }
-#else
-        static inline constexpr int ISNANINF(const ftype x)
-        {
-            const auto bits = get_bits(x);
-            return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
-                ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
-        }
-#endif
-
-        static inline constexpr int EXP(const ftype x)
-        {
-            const auto bits = get_bits(x);
-            return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1);
-        }
-
-        ///Get index of float-point precision
-        ///The index of a non-binned type is the smallest index a binned type would
-        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
-        ///bins.
-        static inline constexpr int binned_dindex(const ftype x)
-        {
-            int exp = EXP(x);
-            if (exp == 0)
-            {
-                if (x == 0.0)
-                {
-                    return MAXINDEX;
-                }
-                else
-                {
-                    frexp(x, &exp);
-                    return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
-                }
-            }
-            return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
-        }
-
-        ///Get index of manually specified binned double precision
-        ///The index of a binned type is the bin that it corresponds to. Higher
-        ///indicies correspond to smaller bins.
-        inline int binned_index() const
-        {
-            return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
-                       EXP(primary(0))) /
-                BIN_WIDTH;
-        }
-
-        ///Check if index of manually specified binned floating-point is 0
-        ///A quick check to determine if the index is 0
-        inline bool binned_index0() const
-        {
-            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
-        }
-
-        ///Update manually specified binned fp with a scalar (X -> Y)
-        ///
-        ///This method updates the binned fp to an index suitable for adding numbers
-        ///with absolute value less than @p max_abs_val
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdupdate(
-            const ftype max_abs_val, const int incpriY, const int inccarY)
-        {
-            if (ISNANINF(primary(0)))
-                return;
-
-            int X_index = binned_dindex(max_abs_val);
-            if (ISZERO(primary(0)))
-            {
-                const ftype* const bins = binned_bins(X_index);
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) = bins[i];
-                    carry(i * inccarY) = 0.0;
-                }
-            }
-            else
-            {
-                int shift = binned_index() - X_index;
-                if (shift > 0)
-                {
-#pragma unroll
-                    for (int i = FOLD - 1; i >= 1; i--)
-                    {
-                        if (i < shift)
-                            break;
-                        primary(i * incpriY) = primary((i - shift) * incpriY);
-                        carry(i * inccarY) = carry((i - shift) * inccarY);
-                    }
-                    const ftype* const bins = binned_bins(X_index);
-#pragma unroll
-                    for (int j = 0; j < FOLD; j++)
-                    {
-                        if (j >= shift)
-                            break;
-                        primary(j * incpriY) = bins[j];
-                        carry(j * inccarY) = 0.0;
-                    }
-                }
-            }
-        }
-
-        ///Add scalar @p X to suitably binned manually specified binned fp (Y += X)
-        ///
-        ///Performs the operation Y += X on an binned type Y where the index of Y is
-        ///larger than the index of @p X
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        void binned_dmddeposit(const ftype X, const int incpriY)
-        {
-            ftype M;
-            ftype x = X;
-
-            if (ISNANINF(x) || ISNANINF(primary(0)))
-            {
-                primary(0) += x;
-                return;
-            }
-
-            if (binned_index0())
-            {
-                M = primary(0);
-                ftype qd = x * COMPRESSION;
-                auto& ql = get_bits(qd);
-                ql |= 1;
-                qd += M;
-                primary(0) = qd;
-                M -= qd;
-                M *= EXPANSION * 0.5;
-                x += M;
-                x += M;
-#pragma unroll
-                for (int i = 1; i < FOLD - 1; i++)
-                {
-                    M = primary(i * incpriY);
-                    qd = x;
-                    ql |= 1;
-                    qd += M;
-                    primary(i * incpriY) = qd;
-                    M -= qd;
-                    x += M;
-                }
-                qd = x;
-                ql |= 1;
-                primary((FOLD - 1) * incpriY) += qd;
-            }
-            else
-            {
-                ftype qd = x;
-                auto& ql = get_bits(qd);
-#pragma unroll
-                for (int i = 0; i < FOLD - 1; i++)
-                {
-                    M = primary(i * incpriY);
-                    qd = x;
-                    ql |= 1;
-                    qd += M;
-                    primary(i * incpriY) = qd;
-                    M -= qd;
-                    x += M;
-                }
-                qd = x;
-                ql |= 1;
-                primary((FOLD - 1) * incpriY) += qd;
-            }
-        }
-
-        ///Renormalize manually specified binned double precision
-        ///
-        ///Renormalization keeps the primary vector within the necessary bins by
-        ///shifting over to the carry vector
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        inline void binned_dmrenorm(const int incpriX, const int inccarX)
-        {
-            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
-                return;
-
-            for (int i = 0; i < FOLD; i++)
-            {
-                auto tmp_renormd = primary(i * incpriX);
-                auto& tmp_renorml = get_bits(tmp_renormd);
-
-                carry(i * inccarX) +=
-                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
-
-                tmp_renorml &= ~(1ull << (MANT_DIG - 3));
-                tmp_renorml |= 1ull << (MANT_DIG - 2);
-                primary(i * incpriX) = tmp_renormd;
-            }
-        }
-
-        ///Add scalar to manually specified binned fp (Y += X)
-        ///
-        ///Performs the operation Y += X on an binned type Y
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdadd(const ftype X, const int incpriY, const int inccarY)
-        {
-            binned_dmdupdate(X, incpriY, inccarY);
-            binned_dmddeposit(X, incpriY);
-            binned_dmrenorm(incpriY, inccarY);
-        }
-
-        ///Convert manually specified binned fp to native double-precision (X -> Y)
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        double binned_conv_double(const int incpriX, const int inccarX) const
-        {
-            int i = 0;
-
-            if (ISNANINF(primary(0)))
-                return primary(0);
-            if (ISZERO(primary(0)))
-                return 0.0;
-
-            double Y = 0.0;
-            double scale_down;
-            double scale_up;
-            int scaled;
-            const auto X_index = binned_index();
-            const auto* const bins = binned_bins(X_index);
-            if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
-            {
-                scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
-                scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
-                scaled =
-                    max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
-                if (X_index == 0)
-                {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
-                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
-                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
-                    i = 2;
-                }
-                else
-                {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
-                    i = 1;
-                }
-                for (; i < scaled; i++)
-                {
-                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
-                    Y +=
-                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
-                }
-                if (i == FOLD)
-                {
-                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
-                        scale_down;
-                    return Y * scale_up;
-                }
-                if (std::isinf(Y * scale_up))
-                {
-                    return Y * scale_up;
-                }
-                Y *= scale_up;
-                for (; i < FOLD; i++)
-                {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += primary((i - 1) * incpriX) - bins[i - 1];
-                }
-                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
-            }
-            else
-            {
-                Y += carry(0) * (bins[0] / 6.0);
-                for (i = 1; i < FOLD; i++)
-                {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
-                }
-                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
-            }
-            return Y;
-        }
-
-        ///Convert manually specified binned fp to native single-precision (X -> Y)
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        float binned_conv_single(const int incpriX, const int inccarX) const
-        {
-            int i = 0;
-            double Y = 0.0;
-
-            if (ISNANINF(primary(0)))
-                return primary(0);
-            if (ISZERO(primary(0)))
-                return 0.0;
-
-            //Note that the following order of summation is in order of decreasing
-            //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
-            //the number of carries equal to 1.
-            const auto X_index = binned_index();
-            const auto* const bins = binned_bins(X_index);
-            if (X_index == 0)
-            {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
-                    (double) EXPANSION;
-                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
-                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
-                i = 2;
-            }
-            else
-            {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0);
-                i = 1;
-            }
-            for (; i < FOLD; i++)
-            {
-                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
-                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
-            }
-            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
-
-            return (float) Y;
-        }
-
-        ///Add two manually specified binned fp (Y += X)
-        ///Performs the operation Y += X
-        ///
-        ///@param other   Another binned fp of the same type
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
-            const int incpriX, const int inccarX, const int incpriY,
-            const int inccarY)
-        {
-            if (ISZERO(x.primary(0)))
-                return;
-
-            if (ISZERO(primary(0)))
-            {
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) = x.primary(i * incpriX);
-                    carry(i * inccarY) = x.carry(i * inccarX);
-                }
-                return;
-            }
-
-            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
-            {
-                primary(0) += x.primary(0);
-                return;
-            }
-
-            const auto X_index = x.binned_index();
-            const auto Y_index = this->binned_index();
-            const auto shift = Y_index - X_index;
-            if (shift > 0)
-            {
-                const auto* const bins = binned_bins(Y_index);
-                //shift Y upwards and add X to Y
-#pragma unroll
-                for (int i = FOLD - 1; i >= 1; i--)
-                {
-                    if (i < shift)
-                        break;
-                    primary(i * incpriY) = x.primary(i * incpriX) +
-                        (primary((i - shift) * incpriY) - bins[i - shift]);
-                    carry(i * inccarY) =
-                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
-                }
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    if (i == shift)
-                        break;
-                    primary(i * incpriY) = x.primary(i * incpriX);
-                    carry(i * inccarY) = x.carry(i * inccarX);
-                }
-            }
-            else if (shift < 0)
-            {
-                const auto* const bins = binned_bins(X_index);
-                //shift X upwards and add X to Y
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    if (i < -shift)
-                        continue;
-                    primary(i * incpriY) +=
-                        x.primary((i + shift) * incpriX) - bins[i + shift];
-                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
-                }
-            }
-            else if (shift == 0)
-            {
-                const auto* const bins = binned_bins(X_index);
-                // add X to Y
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
-                    carry(i * inccarY) += x.carry(i * inccarX);
-                }
-            }
-
-            binned_dmrenorm(incpriY, inccarY);
-        }
-
-        ///Add two manually specified binned fp (Y += X)
-        ///Performs the operation Y += X
-        void binned_dbdbadd(const ReproducibleFloatingAccumulator& other)
-        {
-            binned_dmdmadd(other, 1, 1, 1, 1);
-        }
-
-    public:
-        ReproducibleFloatingAccumulator() = default;
-        ReproducibleFloatingAccumulator(
-            const ReproducibleFloatingAccumulator&) = default;
-        ///Sets this binned fp equal to another binned fp
-        ReproducibleFloatingAccumulator& operator=(
-            const ReproducibleFloatingAccumulator&) = default;
-
-        ///Set the binned fp to zero
-        void zero()
-        {
-            data = {0};
-        }
-
-        ///Return the fold of the binned fp
-        constexpr int fold() const
-        {
-            return FOLD;
-        }
-
-        ///Return the endurance of the binned fp
-        constexpr int endurance() const
-        {
-            return ENDURANCE;
-        }
-
-        ///Returns the number of reference bins. Used for judging memory usage.
-        constexpr size_t number_of_reference_bins()
-        {
-            return array<ftype, MAXINDEX + MAXFOLD>::size();
-        }
-
-        ///Accumulate an arithmetic @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator+=(const U x)
-        {
-            binned_dmdadd(static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Accumulate-subtract an arithmetic @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator-=(const U x)
-        {
-            binned_dmdadd(-static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Accumulate a binned fp @p x into the binned fp.
-        ReproducibleFloatingAccumulator& operator+=(
-            const ReproducibleFloatingAccumulator& other)
-        {
-            binned_dbdbadd(other);
-            return *this;
-        }
-
-        ///Accumulate-subtract a binned fp @p x into the binned fp.
-        ///NOTE: Makes a copy and performs arithmetic; slow.
-        ReproducibleFloatingAccumulator& operator-=(
-            const ReproducibleFloatingAccumulator& other)
-        {
-            const auto temp = -other;
-            binned_dbdbadd(temp);
-        }
-
-        ///Determines if two binned fp are equal
-        bool operator==(const ReproducibleFloatingAccumulator& other) const
-        {
-            return data == other.data;
-        }
-
-        ///Determines if two binned fp are not equal
-        bool operator!=(const ReproducibleFloatingAccumulator& other) const
-        {
-            return !operator==(other);
-        }
-
-        ///Sets this binned fp equal to the arithmetic value @p x
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator=(const U x)
-        {
-            zero();
-            binned_dmdadd(static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Returns the negative of this binned fp
-        ///NOTE: Makes a copy and performs arithmetic; slow.
-        ReproducibleFloatingAccumulator operator-()
-        {
-            constexpr int incpriX = 1;
-            constexpr int inccarX = 1;
-            ReproducibleFloatingAccumulator temp = *this;
-            if (primary(0) != 0.0)
-            {
-                const auto* const bins = binned_bins(binned_index());
-                for (int i = 0; i < FOLD; i++)
-                {
-                    temp.primary(i * incpriX) =
-                        bins[i] - (primary(i * incpriX) - bins[i]);
-                    temp.carry(i * inccarX) = -carry(i * inccarX);
-                }
-            }
-            return temp;
-        }
-
-        ///Convert this binned fp into its native floating-point representation
-        ftype conv() const
-        {
-            if (std::is_same_v<ftype, float>)
-            {
-                return binned_conv_single(1, 1);
-            }
-            else
-            {
-                return binned_conv_double(1, 1);
-            }
-        }
-
-        ///@brief Get binned fp summation error bound
-        ///
-        ///This is a bound on the absolute error of a summation using binned types
-        ///
-        ///@param N           The number of single precision floating point summands
-        ///@param max_abs_val The summand of maximum absolute value
-        ///@param binned_sum  The value of the sum computed using binned types
-        ///@return            The absolute error bound
-        static constexpr ftype error_bound(
-            const uint64_t N, const ftype max_abs_val, const ftype binned_sum)
-        {
-            const double X = std::abs(max_abs_val);
-            const double S = std::abs(binned_sum);
-            return static_cast<ftype>(max(X, ldexp(0.5, MIN_EXP - 1)) *
-                    ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
-                ((7.0 * EPSILON) /
-                    (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
-                        7.0 * EPSILON)) *
-                    S);
-        }
-
-        ///Add @p x to the binned fp
-        void add(const ftype x)
-        {
-            binned_dmdadd(x, 1, 1);
-        }
-
-        ///Add arithmetics in the range [first, last) to the binned fp
-        ///
-        ///@param first       Start of range
-        ///@param last        End of range
-        ///@param max_abs_val Maximum absolute value of any member of the range
-        template <typename InputIt>
-        void add(InputIt first, InputIt last, const ftype max_abs_val)
-        {
-            binned_dmdupdate(std::abs(max_abs_val), 1, 1);
-            size_t count = 0;
-            size_t N = last - first;
-            for (; first != last; first++, count++)
-            {
-                binned_dmddeposit(static_cast<ftype>(*first), 1);
-                // first conditional allows compiler to remove the call here when possible
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        ///Add arithmetics in the range [first, last) to the binned fp
-        ///
-        ///NOTE: A maximum absolute value is calculated, so two passes are made over
-        ///      the data
-        ///
-        ///@param first       Start of range
-        ///@param last        End of range
-        template <typename InputIt>
-        void add(InputIt first, InputIt last)
-        {
-            const auto max_abs_val = *std::max_element(
-                first, last, [](const auto& a, const auto& b) {
-                    return std::abs(a) < std::abs(b);
-                });
-            add(first, last, static_cast<ftype>(max_abs_val));
-        }
-
-        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
-        ///
-        ///@param input       Start of the range
-        ///@param N           Number of elements to add
-        ///@param max_abs_val Maximum absolute value of any member of the range
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void add(const T* input, const size_t N, const ftype max_abs_val)
-        {
-            if (N == 0)
-                return;
-            add(input, input + N, max_abs_val);
-        }
-
-        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
-        ///
-        ///NOTE: A maximum absolute value is calculated, so two passes are made over
-        ///      the data
-        ///
-        ///@param input       Start of the range
-        ///@param N           Number of elements to add
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void add(const T* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            T max_abs_val = input[0];
-            for (size_t i = 0; i < N; i++)
-            {
-                max_abs_val = max(max_abs_val, std::abs(input[i]));
-            }
-            add(input, N, max_abs_val);
-        }
-
-        ///Accumulate a float4 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const float4& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            binned_dmddeposit(static_cast<ftype>(x.z), 1);
-            binned_dmddeposit(static_cast<ftype>(x.w), 1);
-            return *this;
-        }
-
-        ///Accumulate a double2 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const float2& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            return *this;
-        }
-
-        ///Accumulate a double2 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const double2& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            return *this;
-        }
-
-        void add(const float4* input, const size_t N, float max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const double2* input, const size_t N, double max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const float2* input, const size_t N, double max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const float4* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        void add(const double2* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        void add(const float2* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        //////////////////////////////////////
-        //MANUAL OPERATIONS; USE WISELY
-        //////////////////////////////////////
-
-        ///Rebins for repeated accumulation of scalars with magnitude <= @p mav
-        ///
-        ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator
-        ///with `unsafe_add` after which `renorm()` must be called. See the source of
-        ///`add()` for an example
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void set_max_abs_val(const T mav)
-        {
-            binned_dmdupdate(std::abs(mav), 1, 1);
-        }
-
-        ///Add @p x to the binned fp
-        ///
-        ///This is intended to be used after a call to `set_max_abs_val()`
-        void unsafe_add(const ftype x)
-        {
-            binned_dmddeposit(x, 1);
-        }
-
-        ///Renormalizes the binned fp
-        ///
-        ///This is intended to be used after a call to `set_max_abs_val()` and one or
-        ///more calls to `unsafe_add()`
-        void renorm()
-        {
-            binned_dmrenorm(1, 1);
-        }
-    };
-
-    
-}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 462b1473f332..0a67dea03026 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -8,7 +8,7 @@
 
 #include <hpx/init.hpp>
 #include <hpx/modules/testing.hpp>
-#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
 #include <hpx/parallel/algorithms/reduce.hpp>
 #include <hpx/parallel/algorithms/reduce_deterministic.hpp>
 

From 6a99390451e0f3f261a0f8aaa1ffe63e1fa6b37b Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Mon, 9 Dec 2024 18:52:14 -0600
Subject: [PATCH 6/9] Added parallel execution of rfa reduction summation

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           | 93 +++++++++++++++++++
 .../algorithms/reduce_deterministic.hpp       | 74 ++++++++-------
 .../unit/algorithms/reduce_deterministic.cpp  | 44 +++++++++
 3 files changed, 178 insertions(+), 33 deletions(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 76f093837def..b2de030eed8b 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -11,6 +11,7 @@
 #include <hpx/functional/invoke.hpp>
 #include <hpx/parallel/algorithms/detail/rfa.hpp>
 #include <hpx/parallel/util/loop.hpp>
+#include <hpx/type_support/pack.hpp>
 
 #include <cstddef>
 #include <limits>
@@ -62,6 +63,84 @@ namespace hpx::parallel::detail {
         }
     };
 
+    template <typename ExPolicy>
+    struct sequential_reduce_deterministic_rfa_t final
+      : hpx::functional::detail::tag_fallback<
+            sequential_reduce_deterministic_rfa_t<ExPolicy>>
+    {
+    private:
+        template <typename InIterB, typename InIterE, typename T,
+            typename Reduce>
+        friend constexpr hpx::parallel::detail::rfa::
+            ReproducibleFloatingAccumulator<T>
+            tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
+                ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r)
+        {
+            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            bins.initialize_bins();
+            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+
+            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+
+            for (auto e = first; e != last; ++e)
+            {
+                rfa += *e;
+            }
+            return rfa;
+        }
+
+        template <typename InIterB, typename T, typename Reduce>
+        friend constexpr hpx::parallel::detail::rfa::
+            ReproducibleFloatingAccumulator<T>
+            tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
+                ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r)
+        {
+            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            bins.initialize_bins();
+            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+
+            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+            auto e = first;
+            for (std::size_t i = 0; i < size; ++i, ++e)
+            {
+                rfa += *e;
+            }
+            return rfa;
+        }
+
+        // template <typename InIterB, typename InIterE, typename T,
+        //     typename Reduce
+        //     // typename = std::enable_if_t<hpx::util::contains<T,
+        //     //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //     //         float>,
+        //     //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //     //         double>>::value>
+        //     >
+        // friend constexpr T tag_fallback_invoke(
+        //     sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
+        //     InIterE last, T init, Reduce&& r)
+        // {
+        //     static_assert(hpx::util::contains<T,
+        //         hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //             float>,
+        //         hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //             double>>::value);
+        //     hpx::parallel::detail::rfa::RFA_bins<T> bins;
+        //     bins.initialize_bins();
+        //     std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+
+        //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+        //     rfa.set_max_abs_val(init);
+        //     rfa.unsafe_add(init);
+        //     rfa.renorm();
+        //     for (auto e = first; e != last; ++e)
+        //     {
+        //         rfa += *e;
+        //     }
+        //     return rfa.conv();
+        // }
+    };
+
 #if !defined(HPX_COMPUTE_DEVICE_CODE)
     template <typename ExPolicy>
     inline constexpr sequential_reduce_deterministic_t<ExPolicy>
@@ -77,4 +156,18 @@ namespace hpx::parallel::detail {
     }
 #endif
 
+#if !defined(HPX_COMPUTE_DEVICE_CODE)
+    template <typename ExPolicy>
+    inline constexpr sequential_reduce_deterministic_rfa_t<ExPolicy>
+        sequential_reduce_deterministic_rfa =
+            sequential_reduce_deterministic_rfa_t<ExPolicy>{};
+#else
+    template <typename ExPolicy, typename... Args>
+    HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa(
+        Args&&... args)
+    {
+        return sequential_reduce_deterministic_rfa_t<ExPolicy>{}(
+            std::forward<Args>(args)...);
+    }
+#endif
 }    // namespace hpx::parallel::detail
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index 56c8d3349898..b8435eec9a02 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include "detail/reduce_deterministic.hpp"
 #if defined(DOXYGEN)
 
 namespace hpx {
@@ -396,42 +397,49 @@ namespace hpx::parallel {
             static constexpr T sequential(ExPolicy&& policy, InIterB first,
                 InIterE last, T_&& init, Reduce&& r)
             {
-                // hpx::parallel::detail::sequential_reduce_deterministic_t<ExPolicy> seq;
-                // return seq(policy,first,last,0.0f,r);
-                return hpx::parallel::detail::sequential_reduce_deterministic<ExPolicy>(
-                    HPX_FORWARD(ExPolicy, policy), first, last,
+                return hpx::parallel::detail::sequential_reduce_deterministic<
+                    ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last,
                     HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r));
             }
 
-            //     template <typename ExPolicy, typename FwdIterB, typename FwdIterE,
-            //         typename T_, typename Reduce>
-            //     static util::detail::algorithm_result_t<ExPolicy, T> parallel(
-            //         ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init,
-            //         Reduce&& r)
-            //     {
-            //         if (first == last)
-            //         {
-            //             return util::detail::algorithm_result<ExPolicy, T>::get(
-            //                 HPX_FORWARD(T_, init));
-            //         }
-
-            //         auto f1 = [r](FwdIterB part_begin, std::size_t part_size) -> T {
-            //             T val = *part_begin;
-            //             return detail::sequential_reduce<ExPolicy>(
-            //                 ++part_begin, --part_size, HPX_MOVE(val), r);
-            //         };
-
-            //         return util::partitioner<ExPolicy, T>::call(
-            //             HPX_FORWARD(ExPolicy, policy), first,
-            //             detail::distance(first, last), HPX_MOVE(f1),
-            //             hpx::unwrapping(
-            //                 [init = HPX_FORWARD(T_, init),
-            //                     r = HPX_FORWARD(Reduce, r)](auto&& results) -> T {
-            //                     return detail::sequential_reduce<ExPolicy>(
-            //                         hpx::util::begin(results),
-            //                         hpx::util::size(results), init, r);
-            //                 }));
-            //     }
+            template <typename ExPolicy, typename FwdIterB, typename FwdIterE,
+                typename T_, typename Reduce>
+            static util::detail::algorithm_result_t<ExPolicy, T> parallel(
+                ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init,
+                Reduce&& r)
+            {
+                if (first == last)
+                {
+                    return util::detail::algorithm_result<ExPolicy, T>::get(
+                        HPX_FORWARD(T_, init));
+                }
+
+                auto f1 = [r, policy](
+                              FwdIterB part_begin, std::size_t part_size)
+                    -> hpx::parallel::detail::rfa::
+                        ReproducibleFloatingAccumulator<T_> {
+                            T val = *part_begin;
+                            return hpx::parallel::detail::
+                                sequential_reduce_deterministic_rfa<ExPolicy>(
+                                    HPX_FORWARD(ExPolicy, policy), ++part_begin,
+                                    --part_size, HPX_MOVE(val), r);
+                        };
+
+                return util::partitioner<ExPolicy, T,
+                    hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+                        T_>>::call(HPX_FORWARD(ExPolicy, policy), first,
+                    detail::distance(first, last), HPX_MOVE(f1),
+                    hpx::unwrapping([init = HPX_FORWARD(T_, init),
+                                        r = HPX_FORWARD(Reduce, r),
+                                        policy](auto&& results) -> T {
+                        return hpx::parallel::detail::
+                            sequential_reduce_deterministic_rfa<ExPolicy>(
+                                HPX_FORWARD(ExPolicy, policy),
+                                hpx::util::begin(results),
+                                hpx::util::size(results), init, r)
+                                .conv();
+                    }));
+            }
         };
         /// \endcond
     }    // namespace detail
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 0a67dea03026..694b50cfb76d 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -79,6 +79,49 @@ void test_reduce1(IteratorTag)
     HPX_TEST_EQ(r2, r3);
 }
 
+template <typename IteratorTag, typename FloatTypeDeterministic,
+    typename FloatTypeNonDeterministic, size_t LEN = 10007>
+void test_reduce_parallel1(IteratorTag)
+{
+    // check if different type for deterministic and nondeeterministic
+    // and same result i.e. correct computation
+    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
+
+    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
+
+    std::vector<FloatTypeDeterministic> deterministic(LEN);
+    std::vector<FloatTypeNonDeterministic> nondeterministic(LEN);
+
+    std::iota(
+        deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0));
+
+    std::iota(nondeterministic.begin(), nondeterministic.end(),
+        FloatTypeNonDeterministic(0));
+
+    FloatTypeDeterministic val_det(0);
+    FloatTypeNonDeterministic val_non_det(0);
+    auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par,
+        iterator_det(std::begin(deterministic)),
+        iterator_det(std::end(deterministic)), val_det, op);
+
+    // verify values
+    // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par,
+    //     iterator_ndet(std::begin(nondeterministic)),
+    //     iterator_ndet(std::end(nondeterministic)), val_non_det, op);
+
+    FloatTypeNonDeterministic r3 = std::accumulate(
+        nondeterministic.begin(), nondeterministic.end(), val_non_det);
+
+    HPX_TEST_EQ(r1, r3);
+    // HPX_TEST_EQ(r2, r3);
+}
+
 template <typename IteratorTag, typename FloatTypeDeterministic,
     size_t LEN = 10007>
 void test_reduce_determinism(IteratorTag)
@@ -178,6 +221,7 @@ void test_reduce1()
     test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
     test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
     test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
+    test_reduce_parallel1<IteratorTag, float, float, 1000>(IteratorTag());
 }
 
 template <typename IteratorTag>

From 520f1613edd947bdb30e3702d39c7d9bb6f96d29 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Mon, 16 Dec 2024 19:24:47 -0600
Subject: [PATCH 7/9] Add reproducible summation reduction

- Currently supports sequential version of reduction
- Tests determinism by shuffling order of floating point numbers

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           |   80 ++
 .../hpx/parallel/algorithms/detail/rfa.hpp    | 1145 +++++++++++++++++
 .../algorithms/reduce_deterministic.hpp       |  537 ++++++++
 .../tests/unit/algorithms/CMakeLists.txt      |    1 +
 .../unit/algorithms/reduce_deterministic.cpp  |  272 ++++
 5 files changed, 2035 insertions(+)
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
 create mode 100644 libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
new file mode 100644
index 000000000000..87069858f492
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -0,0 +1,80 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <hpx/config.hpp>
+#include <hpx/functional/detail/tag_fallback_invoke.hpp>
+#include <hpx/functional/invoke.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
+#include <hpx/parallel/util/loop.hpp>
+
+#include <cstddef>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include "rfa.hpp"
+
+namespace hpx::parallel::detail {
+
+    template <typename ExPolicy>
+    struct sequential_reduce_deterministic_t final
+      : hpx::functional::detail::tag_fallback<
+            sequential_reduce_deterministic_t<ExPolicy>>
+    {
+    private:
+        template <typename InIterB, typename InIterE, typename T,
+            typename Reduce>
+        friend constexpr T tag_fallback_invoke(
+            sequential_reduce_deterministic_t, ExPolicy&&, InIterB first,
+            InIterE last, T init, Reduce&& r)
+        {
+            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            bins.initialize_bins();
+            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+
+            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+            rfa.set_max_abs_val(init);
+            rfa.unsafe_add(init);
+            rfa.renorm();
+            size_t count = 0;
+            T max_val = std::abs(*first);
+            for (auto e = first; e != last; ++e)
+            {
+                T temp_max_val = std::abs(static_cast<T>(*e));
+                if (max_val < temp_max_val)
+                {
+                    rfa.set_max_abs_val(temp_max_val);
+                    max_val = temp_max_val;
+                }
+                rfa.unsafe_add(*e);
+                count++;
+                if (count == rfa.endurance())
+                {
+                    rfa.renorm();
+                    count = 0;
+                }
+            }
+            return rfa.conv();
+        }
+    };
+
+#if !defined(HPX_COMPUTE_DEVICE_CODE)
+    template <typename ExPolicy>
+    inline constexpr sequential_reduce_deterministic_t<ExPolicy>
+        sequential_reduce_deterministic =
+            sequential_reduce_deterministic_t<ExPolicy>{};
+#else
+    template <typename ExPolicy, typename... Args>
+    HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic(
+        Args&&... args)
+    {
+        return sequential_reduce_deterministic_t<ExPolicy>{}(
+            std::forward<Args>(args)...);
+    }
+#endif
+
+}    // namespace hpx::parallel::detail
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
new file mode 100644
index 000000000000..302f823fab71
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -0,0 +1,1145 @@
+//Reproducible Floating Point Accumulations via Binned Floating Point
+//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
+//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
+//
+//The code accomplishes several objectives:
+//
+//1. Reproducible summation, independent of summation order, assuming only a
+//   subset of the IEEE 754 Floating Point Standard
+//
+//2. Has accuracy at least as good as conventional summation, and tunable
+//
+//3. Handles overflow, underflow, and other exceptions reproducibly.
+//
+//4. Makes only one read-only pass over the summands.
+//
+//5. Requires only one parallel reduction.
+//
+//6. Uses minimal memory (6 doubles per accumulator with fold=3).
+//
+//7. Relatively easy to use
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+namespace hpx::parallel::detail::rfa {
+    template <typename F>
+    struct type4
+    {
+        F x;
+        F y;
+        F z;
+        F w;
+    };
+
+    template <typename F>
+    struct type2
+    {
+        F x;
+        F y;
+    };
+    using float4 = type4<float>;
+    using double4 = type4<double>;
+    using float2 = type2<float>;
+    using double2 = type2<double>;
+
+    auto abs_max(float4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<float> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double4 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        auto z = std::abs(a.z);
+        auto w = std::abs(a.w);
+        const std::vector<double> v = {x, y, z, w};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(float2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<float> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+    auto abs_max(double2 a)
+    {
+        auto x = std::abs(a.x);
+        auto y = std::abs(a.y);
+        const std::vector<double> v = {x, y};
+        return *std::max_element(v.begin(), v.end());
+    }
+
+// disable zero checks
+#define DISABLE_ZERO
+
+// disable nan / infinity checks
+#define DISABLE_NANINF
+
+// jump table for indexing into data
+#define MAX_JUMP 5
+    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
+
+    template <typename Real>
+    inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
+    {
+        return std::ldexp(arg, exp);
+        // while (arg == 0)
+        // {
+        //     return arg;
+        // }
+        // while (exp > 0)
+        // {
+        //     arg *= static_cast<Real>(2);
+        //     --exp;
+        // }
+        // while (exp < 0)
+        // {
+        //     arg /= static_cast<Real>(2);
+        //     ++exp;
+        // }
+
+        // return arg;
+    }
+
+    template <class ftype>
+    struct RFA_bins
+    {
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+
+        ///The binned floating-point reference bins
+        std::array<ftype, MAXINDEX + MAXFOLD> bins = {};
+
+        constexpr ftype& operator[](int d)
+        {
+            return bins[d];
+        }
+
+        void initialize_bins()
+        {
+            if constexpr (std::is_same_v<ftype, float>)
+            {
+                bins[0] = std::ldexp(0.75, MAX_EXP);
+            }
+            else
+            {
+                bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1);
+            }
+
+            for (int index = 1; index <= MAXINDEX; index++)
+            {
+                bins[index] = std::ldexp(0.75,
+                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
+            }
+            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
+            {
+                bins[index] = bins[index - 1];
+            }
+        }
+    };
+
+    static char __rfa_bin_host_buffer__[sizeof(RFA_bins<double>)];
+
+    ///Class to hold a reproducible summation of the numbers passed to it
+    ///
+    ///@param ftype Floating-point data type; either `float` or `double
+    ///@param FOLD  The fold; use 3 as a default unless you understand it.
+    template <class ftype_, int FOLD_ = 3,
+        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
+            nullptr>
+    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
+    {
+    public:
+        using ftype = ftype_;
+        static constexpr int FOLD = FOLD_;
+
+    private:
+        std::array<ftype, 2 * FOLD> data = {0};
+
+        ///Floating-point precision bin width
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+        ///Binned floating-point compression factor
+        ///This factor is used to scale down inputs before deposition into the bin of
+        ///highest index
+        static constexpr auto COMPRESSION =
+            1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1));
+        ///Binned double precision expansion factor
+        ///This factor is used to scale up inputs after deposition into the bin of
+        ///highest index
+        static constexpr auto EXPANSION =
+            1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1));
+        static constexpr auto EXP_BIAS = MAX_EXP - 2;
+        static constexpr auto EPSILON = std::numeric_limits<ftype>::epsilon();
+        ///Binned floating-point deposit endurance
+        ///The number of deposits that can be performed before a renorm is necessary.
+        ///Applies also to binned complex double precision.
+        static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
+
+        ///Return a binned floating-point reference bin
+        inline const ftype* binned_bins(const int x) const
+        {
+            return &reinterpret_cast<RFA_bins<ftype>&>(
+                __rfa_bin_host_buffer__)[x];
+        }
+
+        ///Get the bit representation of a float
+        static inline uint32_t& get_bits(float& x)
+        {
+            return *reinterpret_cast<uint32_t*>(&x);
+        }
+        ///Get the bit representation of a double
+        static inline uint64_t& get_bits(double& x)
+        {
+            return *reinterpret_cast<uint64_t*>(&x);
+        }
+        ///Get the bit representation of a const float
+        static inline uint32_t get_bits(const float& x)
+        {
+            return *reinterpret_cast<const uint32_t*>(&x);
+        }
+        ///Get the bit representation of a const double
+        static inline uint64_t get_bits(const double& x)
+        {
+            return *reinterpret_cast<const uint64_t*>(&x);
+        }
+
+        ///Return primary vector value const ref
+        inline const ftype& primary(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[4];
+                default:
+                    return data[FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[i];
+            }
+        }
+
+        ///Return carry vector value const ref
+        inline const ftype& carry(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[FOLD + 0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[FOLD + 1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[FOLD + 2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[FOLD + 3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[FOLD + 4];
+                default:
+                    return data[2 * FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[FOLD + i];
+            }
+        }
+
+        ///Return primary vector value ref
+        inline ftype& primary(int i)
+        {
+            const auto& c = *this;
+            return const_cast<ftype&>(c.primary(i));
+        }
+
+        ///Return carry vector value ref
+        inline ftype& carry(int i)
+        {
+            const auto& c = *this;
+            return const_cast<ftype&>(c.carry(i));
+        }
+
+#ifdef DISABLE_ZERO
+        static inline constexpr bool ISZERO(const ftype)
+        {
+            return false;
+        }
+#else
+        static inline constexpr bool ISZERO(const ftype x)
+        {
+            return x == 0.0;
+        }
+#endif
+
+#ifdef DISABLE_NANINF
+        static inline constexpr int ISNANINF(const ftype)
+        {
+            return false;
+        }
+#else
+        static inline constexpr int ISNANINF(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
+                ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
+        }
+#endif
+
+        static inline constexpr int EXP(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1);
+        }
+
+        ///Get index of float-point precision
+        ///The index of a non-binned type is the smallest index a binned type would
+        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
+        ///bins.
+        static inline constexpr int binned_dindex(const ftype x)
+        {
+            int exp = EXP(x);
+            if (exp == 0)
+            {
+                if (x == 0.0)
+                {
+                    return MAXINDEX;
+                }
+                else
+                {
+                    std::frexp(x, &exp);
+                    return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                }
+            }
+            return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
+        }
+
+        ///Get index of manually specified binned double precision
+        ///The index of a binned type is the bin that it corresponds to. Higher
+        ///indicies correspond to smaller bins.
+        inline int binned_index() const
+        {
+            return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
+                       EXP(primary(0))) /
+                BIN_WIDTH;
+        }
+
+        ///Check if index of manually specified binned floating-point is 0
+        ///A quick check to determine if the index is 0
+        inline bool binned_index0() const
+        {
+            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
+        }
+
+        ///Update manually specified binned fp with a scalar (X -> Y)
+        ///
+        ///This method updates the binned fp to an index suitable for adding numbers
+        ///with absolute value less than @p max_abs_val
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdupdate(
+            const ftype max_abs_val, const int incpriY, const int inccarY)
+        {
+            if (ISNANINF(primary(0)))
+                return;
+
+            int X_index = binned_dindex(max_abs_val);
+            if (ISZERO(primary(0)))
+            {
+                const ftype* const bins = binned_bins(X_index);
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) = bins[i];
+                    carry(i * inccarY) = 0.0;
+                }
+            }
+            else
+            {
+                int shift = binned_index() - X_index;
+                if (shift > 0)
+                {
+#pragma unroll
+                    for (int i = FOLD - 1; i >= 1; i--)
+                    {
+                        if (i < shift)
+                            break;
+                        primary(i * incpriY) = primary((i - shift) * incpriY);
+                        carry(i * inccarY) = carry((i - shift) * inccarY);
+                    }
+                    const ftype* const bins = binned_bins(X_index);
+#pragma unroll
+                    for (int j = 0; j < FOLD; j++)
+                    {
+                        if (j >= shift)
+                            break;
+                        primary(j * incpriY) = bins[j];
+                        carry(j * inccarY) = 0.0;
+                    }
+                }
+            }
+        }
+
+        ///Add scalar @p X to suitably binned manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on an binned type Y where the index of Y is
+        ///larger than the index of @p X
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        void binned_dmddeposit(const ftype X, const int incpriY)
+        {
+            ftype M;
+            ftype x = X;
+
+            if (ISNANINF(x) || ISNANINF(primary(0)))
+            {
+                primary(0) += x;
+                return;
+            }
+
+            if (binned_index0())
+            {
+                M = primary(0);
+                ftype qd = x * COMPRESSION;
+                auto& ql = get_bits(qd);
+                ql |= 1;
+                qd += M;
+                primary(0) = qd;
+                M -= qd;
+                M *= EXPANSION * 0.5;
+                x += M;
+                x += M;
+#pragma unroll
+                for (int i = 1; i < FOLD - 1; i++)
+                {
+                    M = primary(i * incpriY);
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    primary(i * incpriY) = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                primary((FOLD - 1) * incpriY) += qd;
+            }
+            else
+            {
+                ftype qd = x;
+                auto& ql = get_bits(qd);
+#pragma unroll
+                for (int i = 0; i < FOLD - 1; i++)
+                {
+                    M = primary(i * incpriY);
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    primary(i * incpriY) = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                primary((FOLD - 1) * incpriY) += qd;
+            }
+        }
+
+        ///Renormalize manually specified binned double precision
+        ///
+        ///Renormalization keeps the primary vector within the necessary bins by
+        ///shifting over to the carry vector
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        inline void binned_dmrenorm(const int incpriX, const int inccarX)
+        {
+            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
+                return;
+
+            for (int i = 0; i < FOLD; i++)
+            {
+                auto tmp_renormd = primary(i * incpriX);
+                auto& tmp_renorml = get_bits(tmp_renormd);
+
+                carry(i * inccarX) +=
+                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
+
+                tmp_renorml &= ~(1ull << (MANT_DIG - 3));
+                tmp_renorml |= 1ull << (MANT_DIG - 2);
+                primary(i * incpriX) = tmp_renormd;
+            }
+        }
+
+        ///Add scalar to manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on an binned type Y
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdadd(const ftype X, const int incpriY, const int inccarY)
+        {
+            binned_dmdupdate(X, incpriY, inccarY);
+            binned_dmddeposit(X, incpriY);
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Convert manually specified binned fp to native double-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        double binned_conv_double(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
+                return 0.0;
+
+            double Y = 0.0;
+            double scale_down;
+            double scale_up;
+            int scaled;
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
+            {
+                scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
+                scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
+                scaled = std::max(
+                    std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
+                if (X_index == 0)
+                {
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
+                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
+                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
+                    i = 2;
+                }
+                else
+                {
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
+                    i = 1;
+                }
+                for (; i < scaled; i++)
+                {
+                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
+                    Y +=
+                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
+                }
+                if (i == FOLD)
+                {
+                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
+                        scale_down;
+                    return Y * scale_up;
+                }
+                if (std::isinf(Y * scale_up))
+                {
+                    return Y * scale_up;
+                }
+                Y *= scale_up;
+                for (; i < FOLD; i++)
+                {
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += primary((i - 1) * incpriX) - bins[i - 1];
+                }
+                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
+            }
+            else
+            {
+                Y += carry(0) * (bins[0] / 6.0);
+                for (i = 1; i < FOLD; i++)
+                {
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
+                }
+                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+            }
+            return Y;
+        }
+
+        ///Convert manually specified binned fp to native single-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        float binned_conv_single(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+            double Y = 0.0;
+
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
+                return 0.0;
+
+            //Note that the following order of summation is in order of decreasing
+            //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
+            //the number of carries equal to 1.
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index == 0)
+            {
+                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
+                    (double) EXPANSION;
+                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
+                i = 2;
+            }
+            else
+            {
+                Y += (double) carry(0) * (double) (bins[0] / 6.0);
+                i = 1;
+            }
+            for (; i < FOLD; i++)
+            {
+                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
+            }
+            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+
+            return (float) Y;
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        ///
+        ///@param other   Another binned fp of the same type
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
+            const int incpriX, const int inccarX, const int incpriY,
+            const int inccarY)
+        {
+            if (ISZERO(x.primary(0)))
+                return;
+
+            if (ISZERO(primary(0)))
+            {
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
+                }
+                return;
+            }
+
+            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
+            {
+                primary(0) += x.primary(0);
+                return;
+            }
+
+            const auto X_index = x.binned_index();
+            const auto Y_index = this->binned_index();
+            const auto shift = Y_index - X_index;
+            if (shift > 0)
+            {
+                const auto* const bins = binned_bins(Y_index);
+                //shift Y upwards and add X to Y
+#pragma unroll
+                for (int i = FOLD - 1; i >= 1; i--)
+                {
+                    if (i < shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX) +
+                        (primary((i - shift) * incpriY) - bins[i - shift]);
+                    carry(i * inccarY) =
+                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
+                }
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    if (i == shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
+                }
+            }
+            else if (shift < 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                //shift X upwards and add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    if (i < -shift)
+                        continue;
+                    primary(i * incpriY) +=
+                        x.primary((i + shift) * incpriX) - bins[i + shift];
+                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
+                }
+            }
+            else if (shift == 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                // add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
+                    carry(i * inccarY) += x.carry(i * inccarX);
+                }
+            }
+
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        void binned_dbdbadd(const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dmdmadd(other, 1, 1, 1, 1);
+        }
+
+    public:
+        ReproducibleFloatingAccumulator() = default;
+        ReproducibleFloatingAccumulator(
+            const ReproducibleFloatingAccumulator&) = default;
+        ///Sets this binned fp equal to another binned fp
+        ReproducibleFloatingAccumulator& operator=(
+            const ReproducibleFloatingAccumulator&) = default;
+
+        ///Set the binned fp to zero
+        void zero()
+        {
+            data = {0};
+        }
+
+        ///Return the fold of the binned fp
+        constexpr int fold() const
+        {
+            return FOLD;
+        }
+
+        ///Return the endurance of the binned fp
+        constexpr int endurance() const
+        {
+            return ENDURANCE;
+        }
+
+        ///Returns the number of reference bins. Used for judging memory usage.
+        constexpr size_t number_of_reference_bins()
+        {
+            return std::array<ftype, MAXINDEX + MAXFOLD>::size();
+        }
+
+        ///Accumulate an arithmetic @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator+=(const U x)
+        {
+            binned_dmdadd(static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Accumulate-subtract an arithmetic @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator-=(const U x)
+        {
+            binned_dmdadd(-static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Accumulate a binned fp @p x into the binned fp.
+        ReproducibleFloatingAccumulator& operator+=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dbdbadd(other);
+            return *this;
+        }
+
+        ///Accumulate-subtract a binned fp @p x into the binned fp.
+        ///NOTE: Makes a copy and performs arithmetic; slow.
+        ReproducibleFloatingAccumulator& operator-=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            const auto temp = -other;
+            binned_dbdbadd(temp);
+        }
+
+        ///Determines if two binned fp are equal
+        bool operator==(const ReproducibleFloatingAccumulator& other) const
+        {
+            return data == other.data;
+        }
+
+        ///Determines if two binned fp are not equal
+        bool operator!=(const ReproducibleFloatingAccumulator& other) const
+        {
+            return !operator==(other);
+        }
+
+        ///Sets this binned fp equal to the arithmetic value @p x
+        ///NOTE: Casts @p x to the type of the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator=(const U x)
+        {
+            zero();
+            binned_dmdadd(static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Returns the negative of this binned fp
+        ///NOTE: Makes a copy and performs arithmetic; slow.
+        ReproducibleFloatingAccumulator operator-()
+        {
+            constexpr int incpriX = 1;
+            constexpr int inccarX = 1;
+            ReproducibleFloatingAccumulator temp = *this;
+            if (primary(0) != 0.0)
+            {
+                const auto* const bins = binned_bins(binned_index());
+                for (int i = 0; i < FOLD; i++)
+                {
+                    temp.primary(i * incpriX) =
+                        bins[i] - (primary(i * incpriX) - bins[i]);
+                    temp.carry(i * inccarX) = -carry(i * inccarX);
+                }
+            }
+            return temp;
+        }
+
+        ///Convert this binned fp into its native floating-point representation
+        ftype conv() const
+        {
+            if (std::is_same_v<ftype, float>)
+            {
+                return binned_conv_single(1, 1);
+            }
+            else
+            {
+                return binned_conv_double(1, 1);
+            }
+        }
+
+        ///@brief Get binned fp summation error bound
+        ///
+        ///This is a bound on the absolute error of a summation using binned types
+        ///
+        ///@param N           The number of single precision floating point summands
+        ///@param max_abs_val The summand of maximum absolute value
+        ///@param binned_sum  The value of the sum computed using binned types
+        ///@return            The absolute error bound
+        static constexpr ftype error_bound(
+            const uint64_t N, const ftype max_abs_val, const ftype binned_sum)
+        {
+            const double X = std::abs(max_abs_val);
+            const double S = std::abs(binned_sum);
+            return static_cast<ftype>(max(X, std::ldexp(0.5, MIN_EXP - 1)) *
+                    std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
+                ((7.0 * EPSILON) /
+                    (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
+                        7.0 * EPSILON)) *
+                    S);
+        }
+
+        ///Add @p x to the binned fp
+        void add(const ftype x)
+        {
+            binned_dmdadd(x, 1, 1);
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///
+        ///@param first       Start of range
+        ///@param last        End of range
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last, const ftype max_abs_val)
+        {
+            binned_dmdupdate(std::abs(max_abs_val), 1, 1);
+            size_t count = 0;
+            size_t N = last - first;
+            for (; first != last; first++, count++)
+            {
+                binned_dmddeposit(static_cast<ftype>(*first), 1);
+                // first conditional allows compiler to remove the call here when possible
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///
+        ///NOTE: A maximum absolute value is calculated, so two passes are made over
+        ///      the data
+        ///
+        ///@param first       Start of range
+        ///@param last        End of range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last)
+        {
+            const auto max_abs_val = *std::max_element(
+                first, last, [](const auto& a, const auto& b) {
+                    return std::abs(a) < std::abs(b);
+                });
+            add(first, last, static_cast<ftype>(max_abs_val));
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void add(const T* input, const size_t N, const ftype max_abs_val)
+        {
+            if (N == 0)
+                return;
+            add(input, input + N, max_abs_val);
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///NOTE: A maximum absolute value is calculated, so two passes are made over
+        ///      the data
+        ///
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void add(const T* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            T max_abs_val = input[0];
+            for (size_t i = 0; i < N; i++)
+            {
+                max_abs_val = max(max_abs_val, std::abs(input[i]));
+            }
+            add(input, N, max_abs_val);
+        }
+
+        ///Accumulate a float4 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float4& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            binned_dmddeposit(static_cast<ftype>(x.z), 1);
+            binned_dmddeposit(static_cast<ftype>(x.w), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const float2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        ///Accumulate a double2 @p x into the binned fp.
+        ///NOTE: Casts @p x to the type of the binned fp
+        ReproducibleFloatingAccumulator& operator+=(const double2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        void add(const float4* input, const size_t N, float max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const double2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const float2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+
+            size_t count = 0;
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        void add(const float4* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const double2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        void add(const float2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            auto max_abs_val = abs_max(input[0]);
+            for (size_t i = 1; i < N; i++)
+                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
+
+            add(input, N, max_abs_val);
+        }
+
+        //////////////////////////////////////
+        //MANUAL OPERATIONS; USE WISELY
+        //////////////////////////////////////
+
+        ///Rebins for repeated accumulation of scalars with magnitude <= @p mav
+        ///
+        ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator
+        ///with `unsafe_add` after which `renorm()` must be called. See the source of
+        ///`add()` for an example
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void set_max_abs_val(const T mav)
+        {
+            binned_dmdupdate(std::abs(mav), 1, 1);
+        }
+
+        ///Add @p x to the binned fp
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()`
+        void unsafe_add(const ftype x)
+        {
+            binned_dmddeposit(x, 1);
+        }
+
+        ///Renormalizes the binned fp
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()` and one or
+        ///more calls to `unsafe_add()`
+        void renorm()
+        {
+            binned_dmrenorm(1, 1);
+        }
+    };
+
+}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
new file mode 100644
index 000000000000..9920d3bd7a6e
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -0,0 +1,537 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+/// \file parallel/algorithms/reduce_deterministic.hpp
+/// \page hpx::reduce_deterministic
+/// \headerfile hpx/algorithm.hpp
+
+#pragma once
+
+#if defined(DOXYGEN)
+
+namespace hpx {
+
+    // clang-format off
+
+    /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         predicate \a f.
+    ///
+    /// \tparam ExPolicy    The type of the execution policy to use (deduced).
+    ///                     It describes the manner in which the execution
+    ///                     of the algorithm may be parallelized and the manner
+    ///                     in which it executes the assignments.
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of a
+    ///                     forward iterator.
+    /// \tparam F           The type of the function/function object to use
+    ///                     (deduced). Unlike its sequential form, the parallel
+    ///                     overload of \a reduce requires \a F to meet the
+    ///                     requirements of \a CopyConstructible.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param policy       The execution policy to use for the scheduling of
+    ///                     the iterations.
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    /// \param f            Specifies the function (or function object) which
+    ///                     will be invoked for each of the elements in the
+    ///                     sequence specified by [first, last). This is a
+    ///                     binary predicate. The signature of this predicate
+    ///                     should be equivalent to:
+    ///                     \code
+    ///                     Ret fun(const Type1 &a, const Type1 &b);
+    ///                     \endcode \n
+    ///                     The signature does not need to have const&.
+    ///                     The types \a Type1 \a Ret must be
+    ///                     such that an object of type \a FwdIter can be
+    ///                     dereferenced and then implicitly converted to any
+    ///                     of those types.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a sequenced_policy
+    /// execute in sequential order in the calling thread.
+    ///
+    /// The reduce operations in the parallel \a copy_if algorithm invoked
+    /// with an execution policy object of type \a parallel_policy
+    /// or \a parallel_task_policy are permitted to execute in an unordered
+    /// fashion in unspecified threads, and indeterminately sequenced
+    /// within each thread.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a hpx::future<T> if the
+    ///           execution policy is of type
+    ///           \a sequenced_task_policy or
+    ///           \a parallel_task_policy and
+    ///           returns \a T otherwise.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum over the elements given by the input range
+    ///           [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(op, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename ExPolicy, typename FwdIter, typename F,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    hpx::parallel::util::detail::algorithm_result_t<ExPolicy, T>
+    reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f);
+
+    /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam ExPolicy    The type of the execution policy to use (deduced).
+    ///                     It describes the manner in which the execution
+    ///                     of the algorithm may be parallelized and the manner
+    ///                     in which it executes the assignments.
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of a
+    ///                     forward iterator.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param policy       The execution policy to use for the scheduling of
+    ///                     the iterations.
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a sequenced_policy
+    /// execute in sequential order in the calling thread.
+    ///
+    /// The reduce operations in the parallel \a copy_if algorithm invoked
+    /// with an execution policy object of type \a parallel_policy
+    /// or \a parallel_task_policy are permitted to execute in an unordered
+    /// fashion in unspecified threads, and indeterminately sequenced
+    /// within each thread.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a hpx::future<T> if the
+    ///           execution policy is of type
+    ///           \a sequenced_task_policy or
+    ///           \a parallel_task_policy and
+    ///           returns \a T otherwise.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename ExPolicy, typename FwdIter,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    util::detail::algorithm_result_t<ExPolicy, T>
+    reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init);
+
+    /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam ExPolicy    The type of the execution policy to use (deduced).
+    ///                     It describes the manner in which the execution
+    ///                     of the algorithm may be parallelized and the manner
+    ///                     in which it executes the assignments.
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of a
+    ///                     forward iterator.
+    ///
+    /// \param policy       The execution policy to use for the scheduling of
+    ///                     the iterations.
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a sequenced_policy
+    /// execute in sequential order in the calling thread.
+    ///
+    /// The reduce operations in the parallel \a reduce algorithm invoked
+    /// with an execution policy object of type \a parallel_policy
+    /// or \a parallel_task_policy are permitted to execute in an unordered
+    /// fashion in unspecified threads, and indeterminately sequenced
+    /// within each thread.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a hpx::future<T> if the
+    ///           execution policy is of type
+    ///           \a sequenced_task_policy or
+    ///           \a parallel_task_policy and
+    ///           returns T otherwise (where T is the value_type of
+    ///           \a FwdIter).
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   The type of the initial value (and the result type) \a T is
+    ///         determined from the value_type of the used \a FwdIter.
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename ExPolicy, typename FwdIter>
+    typename hpx::parallel::util::detail::algorithm_result<ExPolicy,
+        typename std::iterator_traits<FwdIter>::value_type
+    >::type
+    reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last);
+
+    /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         predicate \a f.
+    ///
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of an
+    ///                     input iterator.
+    /// \tparam F           The type of the function/function object to use
+    ///                     (deduced). Unlike its sequential form, the parallel
+    ///                     overload of \a reduce requires \a F to meet the
+    ///                     requirements of \a CopyConstructible.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    /// \param f            Specifies the function (or function object) which
+    ///                     will be invoked for each of the elements in the
+    ///                     sequence specified by [first, last). This is a
+    ///                     binary predicate. The signature of this predicate
+    ///                     should be equivalent to:
+    ///                     \code
+    ///                     Ret fun(const Type1 &a, const Type1 &b);
+    ///                     \endcode \n
+    ///                     The signature does not need to have const&.
+    ///                     The types \a Type1 \a Ret must be
+    ///                     such that an object of type \a InIter can be
+    ///                     dereferenced and then implicitly converted to any
+    ///                     of those types.
+    ///
+    /// \returns  The \a reduce algorithm returns \a T.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum over the elements given by the input range
+    ///           [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(op, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename FwdIter, typename F,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f);
+
+    /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of an
+    ///                     input iterator.
+    /// \tparam T           The type of the value to be used as initial (and
+    ///                     intermediate) values (deduced).
+    ///
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    /// \param init         The initial value for the generalized sum.
+    ///
+    /// \returns  The \a reduce algorithm returns a \a T.
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename FwdIter,
+        typename T = typename std::iterator_traits<FwdIter>::value_type>
+    T reduce_deterministic(FwdIter first, FwdIter last, T init);
+
+    /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)).
+    /// Executed according to the policy.
+    ///
+    /// \note   Complexity: O(\a last - \a first) applications of the
+    ///         operator+().
+    ///
+    /// \tparam FwdIter     The type of the source begin and end iterators used
+    ///                     (deduced).
+    ///                     This iterator type must meet the requirements of an
+    ///                     input iterator.
+    ///
+    /// \param first        Refers to the beginning of the sequence of elements
+    ///                     the algorithm will be applied to.
+    /// \param last         Refers to the end of the sequence of elements the
+    ///                     algorithm will be applied to.
+    ///
+    /// \returns  The \a reduce algorithm returns \a T (where T is the
+    ///           value_type of \a FwdIter).
+    ///           The \a reduce algorithm returns the result of the
+    ///           generalized sum (applying operator+()) over the elements given
+    ///           by the input range [first, last).
+    ///
+    /// \note   The type of the initial value (and the result type) \a T is
+    ///         determined from the value_type of the used \a FwdIter.
+    ///
+    /// \note   GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+    ///         * a1 when N is 1
+    ///         * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+    ///           where:
+    ///           * b1, ..., bN may be any permutation of a1, ..., aN and
+    ///           * 1 < K+1 = M <= N.
+    ///
+    /// The difference between \a reduce and \a accumulate is
+    /// that the behavior of reduce may be non-deterministic for
+    /// non-associative or non-commutative binary predicate.
+    ///
+    template <typename FwdIter>
+    typename std::iterator_traits<FwdIter>::value_type
+    reduce_deterministic(FwdIter first, FwdIter last);
+    // clang-format on
+}    // namespace hpx
+
+#else    // DOXYGEN
+
+#include <hpx/config.hpp>
+#include <hpx/concepts/concepts.hpp>
+#include <hpx/executors/execution_policy.hpp>
+#include <hpx/iterator_support/range.hpp>
+#include <hpx/pack_traversal/unwrap.hpp>
+#include <hpx/parallel/algorithms/detail/accumulate.hpp>
+#include <hpx/parallel/algorithms/detail/dispatch.hpp>
+#include <hpx/parallel/algorithms/detail/distance.hpp>
+#include <hpx/parallel/algorithms/detail/reduce.hpp>
+#include <hpx/parallel/algorithms/detail/reduce_deterministic.hpp>
+#include <hpx/parallel/util/detail/algorithm_result.hpp>
+#include <hpx/parallel/util/detail/sender_util.hpp>
+#include <hpx/parallel/util/loop.hpp>
+#include <hpx/parallel/util/partitioner.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+namespace hpx::parallel {
+
+    ///////////////////////////////////////////////////////////////////////////
+    // reduce
+    namespace detail {
+
+        /// \cond NOINTERNAL
+        template <typename T>
+        struct reduce_deterministic
+          : public algorithm<reduce_deterministic<T>, T>
+        {
+            constexpr reduce_deterministic() noexcept
+              : algorithm<reduce_deterministic, T>("reduce_deterministic")
+            {
+            }
+
+            template <typename ExPolicy, typename InIterB, typename InIterE,
+                typename T_, typename Reduce>
+            static constexpr T sequential(ExPolicy&& policy, InIterB first,
+                InIterE last, T_&& init, Reduce&& r)
+            {
+                return hpx::parallel::detail::sequential_reduce_deterministic<
+                    ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last,
+                    HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r));
+            }
+        };
+        /// \endcond
+    }    // namespace detail
+}    // namespace hpx::parallel
+
+namespace hpx {
+
+    ///////////////////////////////////////////////////////////////////////////
+    // CPO for hpx::reduce
+    inline constexpr struct reduce_deterministic_t final
+      : hpx::detail::tag_parallel_algorithm<reduce_deterministic_t>
+    {
+    private:
+        // clang-format off
+        template <typename ExPolicy, typename FwdIter, typename F,
+            typename T = typename std::iterator_traits<FwdIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::is_execution_policy_v<ExPolicy> &&
+                hpx::traits::is_iterator_v<FwdIter>
+            )>
+        // clang-format on
+        friend hpx::parallel::util::detail::algorithm_result_t<ExPolicy, T>
+        tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy,
+            FwdIter first, FwdIter last, T init, F f)
+        {
+            static_assert(hpx::traits::is_forward_iterator_v<FwdIter>,
+                "Requires at least forward iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init),
+                HPX_MOVE(f));
+        }
+
+        // clang-format off
+        template <typename ExPolicy, typename FwdIter,
+            typename T = typename std::iterator_traits<FwdIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::is_execution_policy_v<ExPolicy> &&
+                hpx::traits::is_iterator_v<FwdIter>
+            )>
+        // clang-format on
+        friend hpx::parallel::util::detail::algorithm_result_t<ExPolicy, T>
+        tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy,
+            FwdIter first, FwdIter last, T init)
+        {
+            static_assert(hpx::traits::is_forward_iterator_v<FwdIter>,
+                "Requires at least forward iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init),
+                std::plus<>{});
+        }
+
+        // clang-format off
+        template <typename ExPolicy, typename FwdIter,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::is_execution_policy_v<ExPolicy> &&
+                hpx::traits::is_iterator_v<FwdIter>
+            )>
+        // clang-format on
+        friend hpx::parallel::util::detail::algorithm_result_t<ExPolicy,
+            typename std::iterator_traits<FwdIter>::value_type>
+        tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy,
+            FwdIter first, FwdIter last)
+        {
+            static_assert(hpx::traits::is_forward_iterator_v<FwdIter>,
+                "Requires at least forward iterator.");
+
+            using value_type =
+                typename std::iterator_traits<FwdIter>::value_type;
+
+            return hpx::parallel::detail::reduce_deterministic<value_type>()
+                .call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{},
+                    std::plus<>{});
+        }
+
+        // clang-format off
+        template <typename InIter, typename F,
+            typename T = typename std::iterator_traits<InIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::traits::is_iterator_v<InIter>
+            )>
+        // clang-format on
+        friend T tag_fallback_invoke(
+            hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f)
+        {
+            static_assert(hpx::traits::is_input_iterator_v<InIter>,
+                "Requires at least input iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f));
+        }
+
+        // clang-format off
+        template <typename InIter,
+            typename T = typename std::iterator_traits<InIter>::value_type,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::traits::is_iterator_v<InIter>
+            )>
+        // clang-format on
+        friend T tag_fallback_invoke(
+            hpx::reduce_deterministic_t, InIter first, InIter last, T init)
+        {
+            static_assert(hpx::traits::is_input_iterator_v<InIter>,
+                "Requires at least input iterator.");
+
+            return hpx::parallel::detail::reduce_deterministic<T>().call(
+                hpx::execution::seq, first, last, HPX_MOVE(init),
+                std::plus<>{});
+        }
+
+        // clang-format off
+        template <typename InIter,
+            HPX_CONCEPT_REQUIRES_(
+                hpx::traits::is_iterator_v<InIter>
+            )>
+        // clang-format on
+        friend typename std::iterator_traits<InIter>::value_type
+        tag_fallback_invoke(
+            hpx::reduce_deterministic_t, InIter first, InIter last)
+        {
+            static_assert(hpx::traits::is_input_iterator_v<InIter>,
+                "Requires at least input iterator.");
+
+            using value_type =
+                typename std::iterator_traits<InIter>::value_type;
+
+            return hpx::parallel::detail::reduce_deterministic<value_type>()
+                .call(hpx::execution::seq, first, last, value_type{},
+                    std::plus<>());
+        }
+    } reduce_deterministic{};
+}    // namespace hpx
+
+#endif    // DOXYGEN
diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
index 7d3a9204530c..559ee830030e 100644
--- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
@@ -94,6 +94,7 @@ set(tests
     partition_copy
     reduce_
     reduce_by_key
+    reduce_deterministic
     remove
     remove1
     remove2
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
new file mode 100644
index 000000000000..92dd2e7f3dc2
--- /dev/null
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -0,0 +1,272 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <hpx/init.hpp>
+#include <hpx/modules/testing.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
+#include <hpx/parallel/algorithms/reduce.hpp>
+#include <hpx/parallel/algorithms/reduce_deterministic.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include "test_utils.hpp"
+
+int seed = std::random_device{}();
+std::mt19937 gen(seed);
+
+template <typename T>
+T get_rand(
+    T LO = std::numeric_limits<T>::min(), T HI = std::numeric_limits<T>::max())
+{
+    return LO +
+        static_cast<T>(std::rand()) / (static_cast<T>(RAND_MAX / (HI - LO)));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename IteratorTag, typename FloatTypeDeterministic,
+    typename FloatTypeNonDeterministic, size_t LEN = 10007>
+void test_reduce1(IteratorTag)
+{
+    // check if different type for deterministic and nondeeterministic
+    // and same result i.e. correct computation
+    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
+
+    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
+
+    std::vector<FloatTypeDeterministic> deterministic(LEN);
+    std::vector<FloatTypeNonDeterministic> nondeterministic(LEN);
+
+    std::iota(
+        deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0));
+
+    std::iota(nondeterministic.begin(), nondeterministic.end(),
+        FloatTypeNonDeterministic(0));
+
+    FloatTypeDeterministic val_det(0);
+    FloatTypeNonDeterministic val_non_det(0);
+    auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeDeterministic r1 =
+        hpx::reduce_deterministic(iterator_det(std::begin(deterministic)),
+            iterator_det(std::end(deterministic)), val_det, op);
+
+    // verify values
+    FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq,
+        iterator_ndet(std::begin(nondeterministic)),
+        iterator_ndet(std::end(nondeterministic)), val_non_det, op);
+
+    FloatTypeNonDeterministic r3 = std::accumulate(
+        nondeterministic.begin(), nondeterministic.end(), val_non_det);
+
+    HPX_TEST_EQ(r1, r3);
+    HPX_TEST_EQ(r2, r3);
+}
+
+template <typename IteratorTag, typename FloatTypeDeterministic,
+    size_t LEN = 10007>
+void test_reduce_determinism(IteratorTag)
+{
+    // check if different type for deterministic and nondeeterministic
+    // and same result
+    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
+
+    constexpr FloatTypeDeterministic num_bounds_det =
+        std::is_same_v<FloatTypeDeterministic, float> ? 1000.0 : 1000000.0;
+
+    std::vector<FloatTypeDeterministic> deterministic(LEN);
+
+    for (size_t i = 0; i < LEN; ++i)
+    {
+        deterministic[i] =
+            get_rand<FloatTypeDeterministic>(-num_bounds_det, num_bounds_det);
+    }
+
+    std::vector<FloatTypeDeterministic> deterministic_shuffled = deterministic;
+
+    std::shuffle(
+        deterministic_shuffled.begin(), deterministic_shuffled.end(), gen);
+
+    FloatTypeDeterministic val_det(41.999);
+
+    auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeDeterministic r1 =
+        hpx::reduce_deterministic(iterator_det(std::begin(deterministic)),
+            iterator_det(std::end(deterministic)), val_det, op);
+
+    FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic(
+        iterator_det(std::begin(deterministic_shuffled)),
+        iterator_det(std::end(deterministic_shuffled)), val_det, op);
+
+    HPX_TEST_EQ(r1,
+        r1_shuffled);    // Deterministically calculated, should always satisfy
+}
+
+/// This test function is never called because it is not guaranteed to pass
+/// It serves an important purpose to demonstrate that floating point summation
+/// is not always associative i.e. a+b+c != a+c+b
+template <typename IteratorTag, typename FloatTypeNonDeterministic,
+    size_t LEN = 10007>
+void test_orig_reduce_determinism(IteratorTag)
+{
+    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
+
+    constexpr auto num_bounds_ndet =
+        std::is_same_v<FloatTypeNonDeterministic, float> ? 1000.0f : 1000000.0f;
+
+    std::vector<FloatTypeNonDeterministic> nondeterministic(LEN);
+    for (size_t i = 0; i < LEN; ++i)
+    {
+        nondeterministic[i] = get_rand<FloatTypeNonDeterministic>(
+            -num_bounds_ndet, num_bounds_ndet);
+    }
+    std::vector<FloatTypeNonDeterministic> nondeterministic_shuffled =
+        nondeterministic;
+    std::shuffle(nondeterministic_shuffled.begin(),
+        nondeterministic_shuffled.end(), gen);
+
+    FloatTypeNonDeterministic val_non_det(41.999);
+
+    auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq,
+        iterator_ndet(std::begin(nondeterministic)),
+        iterator_ndet(std::end(nondeterministic)), val_non_det, op);
+    FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq,
+        iterator_ndet(std::begin(nondeterministic_shuffled)),
+        iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op);
+
+    FloatTypeNonDeterministic r3 = std::accumulate(
+        nondeterministic.begin(), nondeterministic.end(), val_non_det);
+    FloatTypeNonDeterministic r3_shuffled =
+        std::accumulate(nondeterministic_shuffled.begin(),
+            nondeterministic_shuffled.end(), val_non_det);
+
+    /// failed around 131 times out of 1000 on macOS arm
+    /// Floating point addition is not necessarily associative,
+    /// might fail on an architecture not yet known with much higher precision
+    HPX_TEST_NEQ(r2, r2_shuffled);
+    HPX_TEST_NEQ(r3, r3_shuffled);
+}
+
+template <typename IteratorTag>
+void test_reduce1()
+{
+    using namespace hpx::execution;
+
+    test_reduce1<IteratorTag, float, float, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
+}
+
+template <typename IteratorTag>
+void test_reduce2()
+{
+    using namespace hpx::execution;
+
+    test_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
+    test_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
+}
+
+// template <typename IteratorTag>
+// void test_reduce3()
+// {
+//     using namespace hpx::execution;
+
+//     test_orig_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
+//     test_orig_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
+// }
+
+void reduce_test1()
+{
+    test_reduce1<std::random_access_iterator_tag>();
+    test_reduce2<std::random_access_iterator_tag>();
+    // test_reduce3<std::random_access_iterator_tag>();
+    test_reduce1<std::forward_iterator_tag>();
+    test_reduce2<std::forward_iterator_tag>();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+int hpx_main(hpx::program_options::variables_map& vm)
+{
+    unsigned int seed = (unsigned int) std::time(nullptr);
+    bool seed_random = false;
+
+    if (vm.count("seed"))
+    {
+        seed = vm["seed"].as<unsigned int>();
+        seed_random = true;
+    }
+
+    if (vm.count("seed-random"))
+        seed_random = vm["seed-random"].as<bool>();
+
+    if (seed_random)
+    {
+        std::cout << "using seed: " << seed << std::endl;
+        std::cout << "** std::accumulate, hpx::reduce may fail due to "
+                     "non-determinism of the floating summation"
+                  << std::endl;
+        gen.seed(seed);
+        std::srand(seed);
+    }
+    else
+    {
+        gen.seed(223);
+        std::srand(223);
+    }
+
+    reduce_test1();
+
+    return hpx::local::finalize();
+}
+
+int main(int argc, char* argv[])
+{
+    // add command line option which controls the random number generator seed
+    using namespace hpx::program_options;
+    options_description desc_commandline(
+        "Usage: " HPX_APPLICATION_STRING " [options]");
+
+    desc_commandline.add_options()("seed,s", value<unsigned int>(),
+        "the random number generator seed to use for this run");
+    desc_commandline.add_options()("seed-random", value<bool>(),
+        "switch for the random number generator seed to use for this run");
+
+    // By default this test should run on all available cores
+    std::vector<std::string> const cfg = {"hpx.os_threads=all"};
+
+    // Initialize and run HPX
+    hpx::local::init_params init_args;
+    init_args.desc_cmdline = desc_commandline;
+    init_args.cfg = cfg;
+
+    HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0,
+        "HPX main exited with non-zero status");
+
+    return hpx::util::report_errors();
+}

From 8b0ce4cd5e34598840e5f28851341de4d8d01213 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 17 Dec 2024 14:22:55 -0500
Subject: [PATCH 8/9] Address inspect tool, check module cmakelists, warnings
 and spell check

- missing includes
- prevent max/min being expanded as macros
- minor spell check correction
- remove pragma once in cpp file
- resolve implicit type conversions in rfa type to single and double and other places
- add dual license
- remove unnecessary command for macos ci
- use HPX_UNROLL instead of vanilla pragma
- clang-17 cannot unroll so use checks
- add typename qualifier for iterator type

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .github/workflows/macos_debug_fetch_hwloc.yml |   1 -
 libs/core/algorithms/CMakeLists.txt           |   3 +
 .../detail/reduce_deterministic.hpp           |   3 +
 .../hpx/parallel/algorithms/detail/rfa.hpp    | 159 ++++++++++++------
 .../unit/algorithms/reduce_deterministic.cpp  |  26 +--
 5 files changed, 133 insertions(+), 59 deletions(-)

diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml
index f778a6b117d7..caec2af3ac25 100644
--- a/.github/workflows/macos_debug_fetch_hwloc.yml
+++ b/.github/workflows/macos_debug_fetch_hwloc.yml
@@ -19,7 +19,6 @@ jobs:
       run: |
           brew install --overwrite python-tk && \
           brew install --overwrite boost gperftools ninja autoconf automake && \
-          autoreconf -f -i \
           brew upgrade cmake
     - name: Configure
       shell: bash
diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt
index 6fcfed897e2f..9090345722df 100644
--- a/libs/core/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/CMakeLists.txt
@@ -37,7 +37,9 @@ set(algorithms_headers
     hpx/parallel/algorithms/detail/parallel_stable_sort.hpp
     hpx/parallel/algorithms/detail/pivot.hpp
     hpx/parallel/algorithms/detail/reduce.hpp
+    hpx/parallel/algorithms/detail/reduce_deterministic.hpp
     hpx/parallel/algorithms/detail/replace.hpp
+    hpx/parallel/algorithms/detail/rfa.hpp
     hpx/parallel/algorithms/detail/rotate.hpp
     hpx/parallel/algorithms/detail/sample_sort.hpp
     hpx/parallel/algorithms/detail/search.hpp
@@ -72,6 +74,7 @@ set(algorithms_headers
     hpx/parallel/algorithms/partition.hpp
     hpx/parallel/algorithms/reduce_by_key.hpp
     hpx/parallel/algorithms/reduce.hpp
+    hpx/parallel/algorithms/reduce_deterministic.hpp
     hpx/parallel/algorithms/remove_copy.hpp
     hpx/parallel/algorithms/remove.hpp
     hpx/parallel/algorithms/replace.hpp
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 87069858f492..b37730889172 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -13,6 +13,7 @@
 #include <hpx/parallel/util/loop.hpp>
 
 #include <cstddef>
+#include <cstring>
 #include <limits>
 #include <type_traits>
 #include <utility>
@@ -32,6 +33,8 @@ namespace hpx::parallel::detail {
             sequential_reduce_deterministic_t, ExPolicy&&, InIterB first,
             InIterE last, T init, Reduce&& r)
         {
+            /// TODO: Put constraint on Reduce to be a binary plus operator
+            (void) r;
             hpx::parallel::detail::rfa::RFA_bins<T> bins;
             bins.initialize_bins();
             std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index 302f823fab71..fa9142cdf80b 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -1,3 +1,34 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// ---------------------------------------------------------------------------
+// This file has been taken from
+// https://github.com/maddyscientist/reproducible_floating_sums commit
+// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added
+// to this file in accordance with the dual license terms for the Reproducible
+// Floating-Point Summations and conformance with the HPX policy
+// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md
+// ---------------------------------------------------------------------------
+//
+/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+/// The above copyright notice and this permission notice shall be included in
+/// all copies or substantial portions of the Software.
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
 //Reproducible Floating Point Accumulations via Binned Floating Point
 //Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
 //ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
@@ -26,6 +57,10 @@
 #include <cmath>
 #include <cstdint>
 #include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <hpx/config.hpp>
 
 namespace hpx::parallel::detail::rfa {
     template <typename F>
@@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa {
         static constexpr int FOLD = FOLD_;
 
     private:
-        std::array<ftype, 2 * FOLD> data = {0};
+        std::array<ftype, 2 * FOLD> data = {{0}};
 
         ///Floating-point precision bin width
         static constexpr auto BIN_WIDTH =
@@ -351,21 +386,21 @@ namespace hpx::parallel::detail::rfa {
 
         ///Get index of float-point precision
         ///The index of a non-binned type is the smallest index a binned type would
-        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
+        ///need to have to sum it reproducibly. Higher indices correspond to smaller
         ///bins.
         static inline constexpr int binned_dindex(const ftype x)
         {
             int exp = EXP(x);
             if (exp == 0)
             {
-                if (x == 0.0)
+                if (x == static_cast<ftype>(0.0))
                 {
                     return MAXINDEX;
                 }
                 else
                 {
                     std::frexp(x, &exp);
-                    return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                    return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
                 }
             }
             return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
@@ -373,7 +408,7 @@ namespace hpx::parallel::detail::rfa {
 
         ///Get index of manually specified binned double precision
         ///The index of a binned type is the bin that it corresponds to. Higher
-        ///indicies correspond to smaller bins.
+        ///indices correspond to smaller bins.
         inline int binned_index() const
         {
             return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
@@ -416,7 +451,9 @@ namespace hpx::parallel::detail::rfa {
                 int shift = binned_index() - X_index;
                 if (shift > 0)
                 {
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                    HPX_UNROLL
+#endif
                     for (int i = FOLD - 1; i >= 1; i--)
                     {
                         if (i < shift)
@@ -425,7 +462,9 @@ namespace hpx::parallel::detail::rfa {
                         carry(i * inccarY) = carry((i - shift) * inccarY);
                     }
                     const ftype* const bins = binned_bins(X_index);
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                    HPX_UNROLL
+#endif
                     for (int j = 0; j < FOLD; j++)
                     {
                         if (j >= shift)
@@ -457,16 +496,19 @@ namespace hpx::parallel::detail::rfa {
             if (binned_index0())
             {
                 M = primary(0);
-                ftype qd = x * COMPRESSION;
+                ftype qd = x * static_cast<ftype>(COMPRESSION);
                 auto& ql = get_bits(qd);
                 ql |= 1;
                 qd += M;
                 primary(0) = qd;
                 M -= qd;
-                M *= EXPANSION * 0.5;
+                auto temp_m = (double) (((double) EXPANSION) * 0.5);
+                M *= static_cast<ftype>(temp_m);
                 x += M;
                 x += M;
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 1; i < FOLD - 1; i++)
                 {
                     M = primary(i * incpriY);
@@ -485,7 +527,9 @@ namespace hpx::parallel::detail::rfa {
             {
                 ftype qd = x;
                 auto& ql = get_bits(qd);
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD - 1; i++)
                 {
                     M = primary(i * incpriY);
@@ -550,7 +594,7 @@ namespace hpx::parallel::detail::rfa {
             int i = 0;
 
             if (ISNANINF(primary(0)))
-                return primary(0);
+                return (double) primary(0);
             if (ISZERO(primary(0)))
                 return 0.0;
 
@@ -564,29 +608,36 @@ namespace hpx::parallel::detail::rfa {
             {
                 scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
                 scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
-                scaled = std::max(
-                    std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
+                scaled = (std::max)(
+                    (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
                 if (X_index == 0)
                 {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
-                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
-                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
+                    Y += ((double) carry(0)) *
+                        ((((double) bins[0]) / 6.0) * scale_down * EXPANSION);
+                    Y += ((double) carry(inccarX)) *
+                        ((((double) bins[1]) / 6.0) * scale_down);
+                    Y += ((double) primary(0) - (double) bins[0]) * scale_down *
+                        EXPANSION;
                     i = 2;
                 }
                 else
                 {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
+                    Y += ((double) carry(0)) *
+                        (((double) bins[0] / 6.0) * scale_down);
                     i = 1;
                 }
                 for (; i < scaled; i++)
                 {
-                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
-                    Y +=
-                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
+                    Y += ((double) carry(i * inccarX)) *
+                        (((double) bins[i] / 6.0) * scale_down);
+                    Y += ((double) primary((i - 1) * incpriX) -
+                             (double) (bins[i - 1])) *
+                        scale_down;
                 }
                 if (i == FOLD)
                 {
-                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
+                    Y += ((double) primary((FOLD - 1) * incpriX) -
+                             (double) (bins[FOLD - 1])) *
                         scale_down;
                     return Y * scale_up;
                 }
@@ -597,20 +648,23 @@ namespace hpx::parallel::detail::rfa {
                 Y *= scale_up;
                 for (; i < FOLD; i++)
                 {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += primary((i - 1) * incpriX) - bins[i - 1];
+                    Y += ((double) carry(i * inccarX)) *
+                        ((double) bins[i] / 6.0);
+                    Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
                 }
-                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
+                Y += ((double) primary((FOLD - 1) * incpriX) -
+                    ((double) bins[FOLD - 1]));
             }
             else
             {
-                Y += carry(0) * (bins[0] / 6.0);
+                Y += ((double) carry(0)) * ((double) bins[0] / 6.0);
                 for (i = 1; i < FOLD; i++)
                 {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
+                    Y += ((double) carry(i * inccarX)) *
+                        ((double) bins[i] / 6.0);
+                    Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
                 }
-                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+                Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
             }
             return Y;
         }
@@ -627,7 +681,7 @@ namespace hpx::parallel::detail::rfa {
             if (ISNANINF(primary(0)))
                 return primary(0);
             if (ISZERO(primary(0)))
-                return 0.0;
+                return 0.0f;
 
             //Note that the following order of summation is in order of decreasing
             //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
@@ -636,20 +690,22 @@ namespace hpx::parallel::detail::rfa {
             const auto* const bins = binned_bins(X_index);
             if (X_index == 0)
             {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
+                Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) *
                     (double) EXPANSION;
-                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+                Y += (double) carry(inccarX) *
+                    (double) (((double) bins[1]) / 6.0);
                 Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
                 i = 2;
             }
             else
             {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0);
+                Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0);
                 i = 1;
             }
             for (; i < FOLD; i++)
             {
-                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+                Y += (double) carry(i * inccarX) *
+                    (double) (((double) bins[i]) / 6.0);
                 Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
             }
             Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
@@ -694,8 +750,10 @@ namespace hpx::parallel::detail::rfa {
             if (shift > 0)
             {
                 const auto* const bins = binned_bins(Y_index);
-                //shift Y upwards and add X to Y
-#pragma unroll
+//shift Y upwards and add X to Y
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = FOLD - 1; i >= 1; i--)
                 {
                     if (i < shift)
@@ -705,7 +763,9 @@ namespace hpx::parallel::detail::rfa {
                     carry(i * inccarY) =
                         x.carry(i * inccarX) + carry((i - shift) * inccarY);
                 }
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD; i++)
                 {
                     if (i == shift)
@@ -717,8 +777,10 @@ namespace hpx::parallel::detail::rfa {
             else if (shift < 0)
             {
                 const auto* const bins = binned_bins(X_index);
-                //shift X upwards and add X to Y
-#pragma unroll
+//shift X upwards and add X to Y
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD; i++)
                 {
                     if (i < -shift)
@@ -731,8 +793,10 @@ namespace hpx::parallel::detail::rfa {
             else if (shift == 0)
             {
                 const auto* const bins = binned_bins(X_index);
-                // add X to Y
-#pragma unroll
+// add X to Y
+#if !defined(HPX_CLANG_VERSION)
+                HPX_UNROLL
+#endif
                 for (int i = 0; i < FOLD; i++)
                 {
                     primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
@@ -771,7 +835,7 @@ namespace hpx::parallel::detail::rfa {
         }
 
         ///Return the endurance of the binned fp
-        constexpr int endurance() const
+        constexpr size_t endurance() const
         {
             return ENDURANCE;
         }
@@ -867,11 +931,11 @@ namespace hpx::parallel::detail::rfa {
         {
             if (std::is_same_v<ftype, float>)
             {
-                return binned_conv_single(1, 1);
+                return static_cast<ftype>(binned_conv_single(1, 1));
             }
             else
             {
-                return binned_conv_double(1, 1);
+                return static_cast<ftype>(binned_conv_double(1, 1));
             }
         }
 
@@ -888,7 +952,8 @@ namespace hpx::parallel::detail::rfa {
         {
             const double X = std::abs(max_abs_val);
             const double S = std::abs(binned_sum);
-            return static_cast<ftype>(max(X, std::ldexp(0.5, MIN_EXP - 1)) *
+            return static_cast<ftype>(
+                (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) *
                     std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
                 ((7.0 * EPSILON) /
                     (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
@@ -973,7 +1038,7 @@ namespace hpx::parallel::detail::rfa {
             T max_abs_val = input[0];
             for (size_t i = 0; i < N; i++)
             {
-                max_abs_val = max(max_abs_val, std::abs(input[i]));
+                max_abs_val = (std::max)(max_abs_val, std::abs(input[i]));
             }
             add(input, N, max_abs_val);
         }
@@ -1142,4 +1207,4 @@ namespace hpx::parallel::detail::rfa {
         }
     };
 
-}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
+}    // namespace hpx::parallel::detail::rfa
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 92dd2e7f3dc2..5a06c509efdc 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -4,8 +4,6 @@
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-#pragma once
-
 #include <hpx/init.hpp>
 #include <hpx/modules/testing.hpp>
 #include <hpx/parallel/algorithms/detail/rfa.hpp>
@@ -19,6 +17,7 @@
 #include <limits>
 #include <numeric>
 #include <random>
+#include <string>
 #include <vector>
 
 #include "test_utils.hpp"
@@ -27,11 +26,12 @@ int seed = std::random_device{}();
 std::mt19937 gen(seed);
 
 template <typename T>
-T get_rand(
-    T LO = std::numeric_limits<T>::min(), T HI = std::numeric_limits<T>::max())
+T get_rand(T LO = (std::numeric_limits<T>::min)(),
+    T HI = (std::numeric_limits<T>::max)())
 {
     return LO +
-        static_cast<T>(std::rand()) / (static_cast<T>(RAND_MAX / (HI - LO)));
+        static_cast<T>(std::rand()) /
+        (static_cast<T>(static_cast<T>((RAND_MAX)) / (HI - LO)));
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -42,10 +42,12 @@ void test_reduce1(IteratorTag)
 {
     // check if different type for deterministic and nondeeterministic
     // and same result i.e. correct computation
-    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using base_iterator_det =
+        typename std::vector<FloatTypeDeterministic>::iterator;
     using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
 
-    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using base_iterator_ndet =
+        typename std::vector<FloatTypeNonDeterministic>::iterator;
     using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
 
     std::vector<FloatTypeDeterministic> deterministic(LEN);
@@ -75,8 +77,8 @@ void test_reduce1(IteratorTag)
     FloatTypeNonDeterministic r3 = std::accumulate(
         nondeterministic.begin(), nondeterministic.end(), val_non_det);
 
-    HPX_TEST_EQ(r1, r3);
-    HPX_TEST_EQ(r2, r3);
+    HPX_TEST_EQ(static_cast<FloatTypeNonDeterministic>(r1), r3);
+    HPX_TEST_EQ(static_cast<FloatTypeNonDeterministic>(r2), r3);
 }
 
 template <typename IteratorTag, typename FloatTypeDeterministic,
@@ -85,7 +87,8 @@ void test_reduce_determinism(IteratorTag)
 {
     // check if different type for deterministic and nondeeterministic
     // and same result
-    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using base_iterator_det =
+        typename std::vector<FloatTypeDeterministic>::iterator;
     using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
 
     constexpr FloatTypeDeterministic num_bounds_det =
@@ -129,7 +132,8 @@ template <typename IteratorTag, typename FloatTypeNonDeterministic,
     size_t LEN = 10007>
 void test_orig_reduce_determinism(IteratorTag)
 {
-    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using base_iterator_ndet =
+        typename std::vector<FloatTypeNonDeterministic>::iterator;
     using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
 
     constexpr auto num_bounds_ndet =

From c205a13231f4af20fa33eae35c89852e5029c1c6 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Wed, 1 Jan 2025 21:36:17 +0530
Subject: [PATCH 9/9] Fix parallel deterministic reduce and add benchmarks

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           | 79 +++++++++++++++++++
 .../algorithms/reduce_deterministic.hpp       | 24 +++---
 .../benchmark_reduce_deterministic.cpp        | 21 +++--
 3 files changed, 108 insertions(+), 16 deletions(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index b37730889172..ffce8febaae3 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -65,6 +65,71 @@ namespace hpx::parallel::detail {
         }
     };
 
+    template <typename ExPolicy>
+    struct sequential_reduce_deterministic_rfa_t final
+      : hpx::functional::detail::tag_fallback<
+            sequential_reduce_deterministic_rfa_t<ExPolicy>>
+    {
+    private:
+        template <typename InIterB, typename T>
+        friend constexpr hpx::parallel::detail::rfa::
+            ReproducibleFloatingAccumulator<T>
+            tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
+                ExPolicy&&, InIterB first, std::size_t partition_size, T init,
+                std::true_type&&)
+        {
+            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            bins.initialize_bins();
+            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+
+            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+            rfa.set_max_abs_val(init);
+            rfa.unsafe_add(init);
+            rfa.renorm();
+            size_t count = 0;
+            T max_val = std::abs(*first);
+            std::size_t partition_size_lim = 0;
+            for (auto e = first; partition_size_lim <= partition_size;
+                partition_size_lim++, e++)
+            {
+                T temp_max_val = std::abs(static_cast<T>(*e));
+                if (max_val < temp_max_val)
+                {
+                    rfa.set_max_abs_val(temp_max_val);
+                    max_val = temp_max_val;
+                }
+                rfa.unsafe_add(*e);
+                count++;
+                if (count == rfa.endurance())
+                {
+                    rfa.renorm();
+                    count = 0;
+                }
+            }
+            return rfa;
+        }
+
+        template <typename InIterB, typename T>
+        friend constexpr T tag_fallback_invoke(
+            sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
+            std::size_t partition_size, T init, std::false_type&&)
+        {
+            hpx::parallel::detail::rfa::RFA_bins<typename T::ftype> bins;
+            bins.initialize_bins();
+            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+
+            T rfa;
+            rfa += init;
+            std::size_t partition_size_lim = 0;
+            for (auto e = first; partition_size_lim <= partition_size;
+                partition_size_lim++, e++)
+            {
+                rfa += (*e);
+            }
+            return rfa;
+        }
+    };
+
 #if !defined(HPX_COMPUTE_DEVICE_CODE)
     template <typename ExPolicy>
     inline constexpr sequential_reduce_deterministic_t<ExPolicy>
@@ -80,4 +145,18 @@ namespace hpx::parallel::detail {
     }
 #endif
 
+#if !defined(HPX_COMPUTE_DEVICE_CODE)
+    template <typename ExPolicy>
+    inline constexpr sequential_reduce_deterministic_rfa_t<ExPolicy>
+        sequential_reduce_deterministic_rfa =
+            sequential_reduce_deterministic_rfa_t<ExPolicy>{};
+#else
+    template <typename ExPolicy, typename... Args>
+    HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa(
+        Args&&... args)
+    {
+        return sequential_reduce_deterministic_rfa_t<ExPolicy>{}(
+            std::forward<Args>(args)...);
+    }
+#endif
 }    // namespace hpx::parallel::detail
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index 996865d519c5..1e59ddba735a 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -407,35 +407,37 @@ namespace hpx::parallel {
                 ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init,
                 Reduce&& r)
             {
+                (void)r;
                 if (first == last)
                 {
                     return util::detail::algorithm_result<ExPolicy, T>::get(
                         HPX_FORWARD(T_, init));
                 }
 
-                auto f1 = [r, policy](
-                              FwdIterB part_begin, std::size_t part_size)
+                auto f1 = [policy](FwdIterB part_begin, std::size_t part_size)
                     -> hpx::parallel::detail::rfa::
                         ReproducibleFloatingAccumulator<T_> {
-                            T val = *part_begin;
+                            T_ val = *part_begin;
                             return hpx::parallel::detail::
-                                sequential_reduce_deterministic<ExPolicy>(
+                                sequential_reduce_deterministic_rfa<ExPolicy>(
                                     HPX_FORWARD(ExPolicy, policy), ++part_begin,
-                                    --part_size, HPX_MOVE(val), r);
+                                    --part_size, HPX_MOVE(val),
+                                    std::true_type{});
                         };
 
-                return util::partitioner<ExPolicy, T,
+                return util::partitioner<ExPolicy, T_,
                     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
                         T_>>::call(HPX_FORWARD(ExPolicy, policy), first,
                     detail::distance(first, last), HPX_MOVE(f1),
-                    hpx::unwrapping([init = HPX_FORWARD(T_, init),
-                                        r = HPX_FORWARD(Reduce, r),
-                                        policy](auto&& results) -> T {
+                    hpx::unwrapping([policy](auto&& results) -> T_ {
                         return hpx::parallel::detail::
-                            sequential_reduce_deterministic<ExPolicy>(
+                            sequential_reduce_deterministic_rfa<ExPolicy>(
                                 HPX_FORWARD(ExPolicy, policy),
                                 hpx::util::begin(results),
-                                hpx::util::size(results), init, r)
+                                hpx::util::size(results),
+                                hpx::parallel::detail::rfa::
+                                    ReproducibleFloatingAccumulator<T_>{},
+                                std::false_type{})
                                 .conv();
                     }));
             }
diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
index daaee2b1269b..5a267dd6a634 100644
--- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
@@ -5,6 +5,7 @@
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
 #include <hpx/config.hpp>
+#include <cstddef>
 
 #if !defined(HPX_COMPUTE_DEVICE_CODE)
 #include <hpx/algorithm.hpp>
@@ -33,15 +34,15 @@ T get_rand(T LO = (std::numeric_limits<T>::min)(),
 
 ///////////////////////////////////////////////////////////////////////////////
 
-void bench_reduce_deterministic(
+void bench_reduce_deterministic(const auto& policy,
     const auto& deterministic_shuffled, const auto& val_det, const auto& op)
 {
     // check if different type for deterministic and nondeeterministic
     // and same result
 
     auto r1_shuffled =
-        hpx::reduce_deterministic((std::begin(deterministic_shuffled)),
-            (std::end(deterministic_shuffled)), val_det, op);
+        hpx::reduce_deterministic(policy, std::begin(deterministic_shuffled),
+            std::end(deterministic_shuffled), val_det, op);
 
     HPX_UNUSED(r1_shuffled);
 }
@@ -61,6 +62,7 @@ int hpx_main(hpx::program_options::variables_map& vm)
     std::srand(seed);
 
     auto test_count = vm["test_count"].as<int>();
+    std::size_t vector_size = vm["vector-size"].as<std::size_t>();
 
     hpx::util::perftests_init(vm);
 
@@ -74,7 +76,7 @@ int hpx_main(hpx::program_options::variables_map& vm)
 
     {
         using FloatTypeDeterministic = float;
-        std::size_t LEN = 10000;
+        std::size_t LEN = vector_size;
 
         constexpr FloatTypeDeterministic num_bounds_det =
             std::is_same_v<FloatTypeDeterministic, float> ? 1000.0 : 1000000.0;
@@ -113,7 +115,14 @@ int hpx_main(hpx::program_options::variables_map& vm)
         {
             hpx::util::perftests_report(
                 "reduce deterministic", "seq", test_count, [&]() {
-                    bench_reduce_deterministic(
+                    bench_reduce_deterministic(hpx::execution::seq,
+                        deterministic_shuffled, val_det, op);
+                });
+        }
+        {
+            hpx::util::perftests_report(
+                "reduce deterministic", "par", test_count, [&]() {
+                    bench_reduce_deterministic(hpx::execution::par,
                         deterministic_shuffled, val_det, op);
                 });
         }
@@ -135,6 +144,8 @@ int main(int argc, char* argv[])
     cmdline.add_options()
         ("test_count", value<int>()->default_value(100),
             "number of tests to be averaged")
+        ("vector-size", value<std::size_t>()->default_value(1000000),
+            "number of elements to be reduced")
         ;
     // clang-format on