diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index f778a6b117d..caec2af3ac2 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,7 +19,6 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ - autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt index 6fcfed897e2..9090345722d 100644 --- a/libs/core/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/CMakeLists.txt @@ -37,7 +37,9 @@ set(algorithms_headers hpx/parallel/algorithms/detail/parallel_stable_sort.hpp hpx/parallel/algorithms/detail/pivot.hpp hpx/parallel/algorithms/detail/reduce.hpp + hpx/parallel/algorithms/detail/reduce_deterministic.hpp hpx/parallel/algorithms/detail/replace.hpp + hpx/parallel/algorithms/detail/rfa.hpp hpx/parallel/algorithms/detail/rotate.hpp hpx/parallel/algorithms/detail/sample_sort.hpp hpx/parallel/algorithms/detail/search.hpp @@ -72,6 +74,7 @@ set(algorithms_headers hpx/parallel/algorithms/partition.hpp hpx/parallel/algorithms/reduce_by_key.hpp hpx/parallel/algorithms/reduce.hpp + hpx/parallel/algorithms/reduce_deterministic.hpp hpx/parallel/algorithms/remove_copy.hpp hpx/parallel/algorithms/remove.hpp hpx/parallel/algorithms/replace.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp new file mode 100644 index 00000000000..ffce8febaae --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -0,0 +1,162 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "rfa.hpp" + +namespace hpx::parallel::detail { + + template + struct sequential_reduce_deterministic_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_t> + { + private: + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, + InIterE last, T init, Reduce&& r) + { + /// TODO: Put constraint on Reduce to be a binary plus operator + (void) r; + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + size_t count = 0; + T max_val = std::abs(*first); + for (auto e = first; e != last; ++e) + { + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } + } + return rfa.conv(); + } + }; + + template + struct sequential_reduce_deterministic_rfa_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_rfa_t> + { + private: + template + friend constexpr hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator + tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, + ExPolicy&&, InIterB first, std::size_t partition_size, T init, + std::true_type&&) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + 
size_t count = 0; + T max_val = std::abs(*first); + std::size_t partition_size_lim = 0; + for (auto e = first; partition_size_lim <= partition_size; + partition_size_lim++, e++) + { + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } + } + return rfa; + } + + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, + std::size_t partition_size, T init, std::false_type&&) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + T rfa; + rfa += init; + std::size_t partition_size_lim = 0; + for (auto e = first; partition_size_lim <= partition_size; + partition_size_lim++, e++) + { + rfa += (*e); + } + return rfa; + } + }; + +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_t + sequential_reduce_deterministic = + sequential_reduce_deterministic_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic( + Args&&... args) + { + return sequential_reduce_deterministic_t{}( + std::forward(args)...); + } +#endif + +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_rfa_t + sequential_reduce_deterministic_rfa = + sequential_reduce_deterministic_rfa_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa( + Args&&... 
args) + { + return sequential_reduce_deterministic_rfa_t{}( + std::forward(args)...); + } +#endif +} // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp new file mode 100644 index 00000000000..fa9142cdf80 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -0,0 +1,1210 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// --------------------------------------------------------------------------- +// This file has been taken from +// https://github.com/maddyscientist/reproducible_floating_sums commit +// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added +// to this file in accordance with the dual license terms for the Reproducible +// Floating-Point Summations and conformance with the HPX policy +// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md +// --------------------------------------------------------------------------- +// +/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. 
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. 
Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + + template + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept + { + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; + } + + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 
40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + std::array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = std::ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char __rfa_bin_host_buffer__[sizeof(RFA_bins)]; + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>* = + nullptr> + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator + { + public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: + std::array data = {{0}}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same_v ? 
40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. 
+ static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Return a binned floating-point reference bin + inline const ftype* binned_bins(const int x) const + { + return &reinterpret_cast&>( + __rfa_bin_host_buffer__)[x]; + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } + } + + ///Return primary vector value ref + inline ftype& primary(int i) + { + const auto& c = *this; + return const_cast(c.primary(i)); + } + + ///Return carry vector 
value ref + inline ftype& carry(int i) + { + const auto& c = *this; + return const_cast(c.carry(i)); + } + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) + { + return false; + } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif + +#ifdef DISABLE_NANINF + static inline constexpr int ISNANINF(const ftype) + { + return false; + } +#else + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } +#endif + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indices correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == static_cast(0.0)) + { + return MAXINDEX; + } + else + { + std::frexp(x, &exp); + return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indices correspond to smaller bins. 
+ inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(primary(0))) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + if (ISNANINF(primary(0))) + return; + + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) + { + const ftype* const bins = binned_bins(X_index); + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; + } + } + else + { + int shift = binned_index() - X_index; + if (shift > 0) + { +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); + } + const ftype* const bins = binned_bins(X_index); +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int j = 0; j < FOLD; j++) + { + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + 
ftype M; + ftype x = X; + + if (ISNANINF(x) || ISNANINF(primary(0))) + { + primary(0) += x; + return; + } + + if (binned_index0()) + { + M = primary(0); + ftype qd = x * static_cast(COMPRESSION); + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + primary(0) = qd; + M -= qd; + auto temp_m = (double) (((double) EXPANSION) * 0.5); + M *= static_cast(temp_m); + x += M; + x += M; +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 1; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + inline void binned_dmrenorm(const int incpriX, const int inccarX) + { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) + return; + + for (int i = 0; i < FOLD; i++) + { + auto tmp_renormd = primary(i * incpriX); + auto& tmp_renorml = get_bits(tmp_renormd); + + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + primary(i * incpriX) = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every 
incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + if (ISNANINF(primary(0))) + return (double) primary(0); + if (ISZERO(primary(0))) + return 0.0; + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = (std::max)( + (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += ((double) carry(0)) * + ((((double) bins[0]) / 6.0) * scale_down * EXPANSION); + Y += ((double) carry(inccarX)) * + ((((double) bins[1]) / 6.0) * scale_down); + Y += ((double) primary(0) - (double) bins[0]) * scale_down * + EXPANSION; + i = 2; + } + else + { + Y += ((double) carry(0)) * + (((double) bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += ((double) carry(i * inccarX)) * + (((double) bins[i] / 6.0) * scale_down); + Y += ((double) primary((i - 1) * incpriX) - + (double) (bins[i - 1])) * + scale_down; + } + if (i == FOLD) + { + Y += ((double) primary((FOLD - 1) * incpriX) - + (double) (bins[FOLD - 1])) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += 
((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += ((double) primary((FOLD - 1) * incpriX) - + ((double) bins[FOLD - 1])); + } + else + { + Y += ((double) carry(0)) * ((double) bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + } + return Y; + } + + ///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0f; + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. 
+ const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) * + (double) EXPANSION; + Y += (double) carry(inccarX) * + (double) (((double) bins[1]) / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carry(i * inccarX) * + (double) (((double) bins[i]) / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + if (ISZERO(x.primary(0))) + return; + + if (ISZERO(primary(0))) + { + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + return; + } + + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) + { + primary(0) += x.primary(0); + return; + } + + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); +//shift Y upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + 
primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); + } +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD; i++) + { + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + } + else if (shift < 0) + { + const auto* const bins = binned_bins(X_index); +//shift X upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD; i++) + { + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); +// add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); + } + } + + binned_dmrenorm(incpriY, inccarY); + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + constexpr int fold() const + { + return FOLD; + } + + ///Return the endurance of the binned fp + constexpr size_t endurance() const + { + return ENDURANCE; + } + + ///Returns the number of reference bins. Used for judging memory usage. 
+ constexpr size_t number_of_reference_bins() + { + return std::array::size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. 
+ ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (primary(0) != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same_v) + { + return static_cast(binned_conv_single(1, 1)); + } + else + { + return static_cast(binned_conv_double(1, 1)); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast( + (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) * + std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + size_t N = last - first; + for (; first != last; first++, count++) + { + 
binned_dmddeposit(static_cast(*first), 1); + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template >* = nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + return; + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template >* = nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = (std::max)(max_abs_val, std::abs(input[i])); + } + add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. 
+ ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + 
size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. 
See the source of + ///`add()` for an example + template >* = nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; + +} // namespace hpx::parallel::detail::rfa diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp new file mode 100644 index 00000000000..1e59ddba735 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -0,0 +1,578 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/// \file parallel/algorithms/reduce_deterministic.hpp +/// \page hpx::reduce_deterministic +/// \headerfile hpx/algorithm.hpp + +#pragma once + +#if defined(DOXYGEN) + +namespace hpx { + + // clang-format off + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. 
+ /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a FwdIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. 
+ /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + hpx::parallel::util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. 
+ ///
+ /// The reduce operations in the parallel \a reduce algorithm invoked
+ /// with an execution policy object of type \a sequenced_policy
+ /// execute in sequential order in the calling thread.
+ ///
+ /// The reduce operations in the parallel \a reduce algorithm invoked
+ /// with an execution policy object of type \a parallel_policy
+ /// or \a parallel_task_policy are permitted to execute in an unordered
+ /// fashion in unspecified threads, and indeterminately sequenced
+ /// within each thread.
+ ///
+ /// \returns The \a reduce algorithm returns a \a hpx::future if the
+ /// execution policy is of type
+ /// \a sequenced_task_policy or
+ /// \a parallel_task_policy and
+ /// returns \a T otherwise.
+ /// The \a reduce algorithm returns the result of the
+ /// generalized sum (applying operator+()) over the elements given
+ /// by the input range [first, last).
+ ///
+ /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows:
+ /// * a1 when N is 1
+ /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)),
+ /// where:
+ /// * b1, ..., bN may be any permutation of a1, ..., aN and
+ /// * 1 < K+1 = M <= N.
+ ///
+ /// The difference between \a reduce and \a accumulate is
+ /// that the behavior of \a reduce may be non-deterministic for
+ /// non-associative or non-commutative binary predicates; in contrast,
+ /// \a reduce_deterministic produces a reproducible (run-to-run
+ /// deterministic) result for floating-point addition.
+ ///
+ template <typename ExPolicy, typename FwdIter,
+     typename T = typename std::iterator_traits<FwdIter>::value_type>
+ util::detail::algorithm_result_t<ExPolicy, T>
+ reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init);
+
+ /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)).
+ /// Executed according to the policy.
+ ///
+ /// \note Complexity: O(\a last - \a first) applications of the
+ /// operator+().
+ ///
+ /// \tparam ExPolicy The type of the execution policy to use (deduced).
+ /// It describes the manner in which the execution
+ /// of the algorithm may be parallelized and the manner
+ /// in which it executes the assignments.
+ /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns T otherwise (where T is the value_type of + /// \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. 
+ /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename hpx::parallel::util::detail::algorithm_result::value_type + >::type + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last); + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a InIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// \returns The \a reduce algorithm returns \a T. 
+ /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// \returns The \a reduce algorithm returns a \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. 
+ /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// \returns The \a reduce algorithm returns \a T (where T is the + /// value_type of \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. 
+ /// + template + typename std::iterator_traits::value_type + reduce_deterministic(FwdIter first, FwdIter last); + // clang-format on +} // namespace hpx + +#else // DOXYGEN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace hpx::parallel { + + /////////////////////////////////////////////////////////////////////////// + // reduce + namespace detail { + + /// \cond NOINTERNAL + template + struct reduce_deterministic + : public algorithm, T> + { + constexpr reduce_deterministic() noexcept + : algorithm("reduce_deterministic") + { + } + + template + static constexpr T sequential(ExPolicy&& policy, InIterB first, + InIterE last, T_&& init, Reduce&& r) + { + return hpx::parallel::detail::sequential_reduce_deterministic< + ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, + HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); + } + + template + static util::detail::algorithm_result_t parallel( + ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init, + Reduce&& r) + { + (void)r; + if (first == last) + { + return util::detail::algorithm_result::get( + HPX_FORWARD(T_, init)); + } + + auto f1 = [policy](FwdIterB part_begin, std::size_t part_size) + -> hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator { + T_ val = *part_begin; + return hpx::parallel::detail:: + sequential_reduce_deterministic_rfa( + HPX_FORWARD(ExPolicy, policy), ++part_begin, + --part_size, HPX_MOVE(val), + std::true_type{}); + }; + + return util::partitioner>::call(HPX_FORWARD(ExPolicy, policy), first, + detail::distance(first, last), HPX_MOVE(f1), + hpx::unwrapping([policy](auto&& results) -> T_ { + return hpx::parallel::detail:: + sequential_reduce_deterministic_rfa( + HPX_FORWARD(ExPolicy, policy), + hpx::util::begin(results), + hpx::util::size(results), + hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator{}, + 
std::false_type{}) + .conv(); + })); + } + }; + /// \endcond + } // namespace detail +} // namespace hpx::parallel + +namespace hpx { + + /////////////////////////////////////////////////////////////////////////// + // CPO for hpx::reduce + inline constexpr struct reduce_deterministic_t final + : hpx::detail::tag_parallel_algorithm + { + private: + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init, F f) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t::value_type> + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + 
.call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{}, + std::plus<>{}); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template + )> + // clang-format on + friend typename std::iterator_traits::value_type + tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(hpx::execution::seq, first, last, value_type{}, + std::plus<>()); + } + } reduce_deterministic{}; +} // namespace hpx + +#endif // DOXYGEN diff --git a/libs/core/algorithms/tests/performance/CMakeLists.txt b/libs/core/algorithms/tests/performance/CMakeLists.txt index d74788a9b47..96ce826dc74 100644 --- a/libs/core/algorithms/tests/performance/CMakeLists.txt +++ b/libs/core/algorithms/tests/performance/CMakeLists.txt @@ -16,6 +16,7 @@ set(benchmarks benchmark_partial_sort_parallel benchmark_partition benchmark_partition_copy + benchmark_reduce_deterministic 
benchmark_remove benchmark_remove_if benchmark_scan_algorithms diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp new file mode 100644 index 00000000000..5a267dd6a63 --- /dev/null +++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp @@ -0,0 +1,159 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include + +#if !defined(HPX_COMPUTE_DEVICE_CODE) +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int seed = 1000; +std::mt19937 gen(seed); + +template +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) +{ + return LO + + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +void bench_reduce_deterministic(const auto& policy, + const auto& deterministic_shuffled, const auto& val_det, const auto& op) +{ + // check if different type for deterministic and nondeeterministic + // and same result + + auto r1_shuffled = + hpx::reduce_deterministic(policy, std::begin(deterministic_shuffled), + std::end(deterministic_shuffled), val_det, op); + + HPX_UNUSED(r1_shuffled); +} + +void bench_reduce(const auto& policy, const auto& deterministic_shuffled, + const auto& val_det, const auto& op) +{ + auto r = hpx::reduce(policy, (std::begin(deterministic_shuffled)), + (std::end(deterministic_shuffled)), val_det, op); + + HPX_UNUSED(r); +} + +////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + std::srand(seed); + + auto test_count = vm["test_count"].as(); + std::size_t 
vector_size = vm["vector-size"].as(); + + hpx::util::perftests_init(vm); + + // verify that input is within domain of program + if (test_count == 0 || test_count < 0) + { + std::cerr << "test_count cannot be zero or negative...\n" << std::flush; + hpx::local::finalize(); + return -1; + } + + { + using FloatTypeDeterministic = float; + std::size_t LEN = vector_size; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = get_rand( + -num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = + deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + { + hpx::util::perftests_report("reduce", "seq", test_count, [&]() { + bench_reduce( + hpx::execution::seq, deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report("reduce", "par", test_count, [&]() { + bench_reduce( + hpx::execution::par, deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "reduce deterministic", "seq", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::seq, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "reduce deterministic", "par", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::par, + deterministic_shuffled, val_det, op); + }); + } + + hpx::util::perftests_print_times(); + } + + return hpx::local::finalize(); +} + +/////////////////////////////////////////////////////////////////////////////// +int main(int argc, char* argv[]) +{ + using namespace hpx::program_options; + + options_description cmdline("usage: " HPX_APPLICATION_STRING " [options]"); + + // clang-format off + cmdline.add_options() + ("test_count", 
value()->default_value(100), + "number of tests to be averaged") + ("vector-size", value()->default_value(1000000), + "number of elements to be reduced") + ; + // clang-format on + + hpx::util::perftests_cfg(cmdline); + hpx::local::init_params init_args; + init_args.desc_cmdline = cmdline; + init_args.cfg = {"hpx.os_threads=all"}; + + return hpx::local::init(hpx_main, argc, argv, init_args); +} +#endif diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 7d3a9204530..559ee830030 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -94,6 +94,7 @@ set(tests partition_copy reduce_ reduce_by_key + reduce_deterministic remove remove1 remove2 diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp new file mode 100644 index 00000000000..5a06c509efd --- /dev/null +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -0,0 +1,276 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils.hpp" + +int seed = std::random_device{}(); +std::mt19937 gen(seed); + +template +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) +{ + return LO + + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +template +void test_reduce1(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = + typename std::vector::iterator; + using iterator_det = test::test_iterator; + + using base_iterator_ndet = + typename std::vector::iterator; + using iterator_ndet = test::test_iterator; + + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); + + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + // verify values + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(static_cast(r1), r3); + HPX_TEST_EQ(static_cast(r2), r3); +} + +template +void 
test_reduce_determinism(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result + using base_iterator_det = + typename std::vector::iterator; + using iterator_det = test::test_iterator; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = + get_rand(-num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic( + iterator_det(std::begin(deterministic_shuffled)), + iterator_det(std::end(deterministic_shuffled)), val_det, op); + + HPX_TEST_EQ(r1, + r1_shuffled); // Deterministically calculated, should always satisfy +} + +/// This test function is never called because it is not guaranteed to pass +/// It serves an important purpose to demonstrate that floating point summation +/// is not always associative i.e. a+b+c != a+c+b +template +void test_orig_reduce_determinism(IteratorTag) +{ + using base_iterator_ndet = + typename std::vector::iterator; + using iterator_ndet = test::test_iterator; + + constexpr auto num_bounds_ndet = + std::is_same_v ? 
1000.0f : 1000000.0f; + + std::vector nondeterministic(LEN); + for (size_t i = 0; i < LEN; ++i) + { + nondeterministic[i] = get_rand( + -num_bounds_ndet, num_bounds_ndet); + } + std::vector nondeterministic_shuffled = + nondeterministic; + std::shuffle(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), gen); + + FloatTypeNonDeterministic val_non_det(41.999); + + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic_shuffled)), + iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + FloatTypeNonDeterministic r3_shuffled = + std::accumulate(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), val_non_det); + + /// failed around 131 times out of 1000 on macOS arm + /// Floating point addition is not necessarily associative, + /// might fail on an architecture not yet known with much higher precision + HPX_TEST_NEQ(r2, r2_shuffled); + HPX_TEST_NEQ(r3, r3_shuffled); +} + +template +void test_reduce1() +{ + using namespace hpx::execution; + + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); +} + +template +void test_reduce2() +{ + using namespace hpx::execution; + + test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); +} + +// template +// void test_reduce3() +// { +// using namespace hpx::execution; + +// test_orig_reduce_determinism(IteratorTag()); +// test_orig_reduce_determinism(IteratorTag()); +// } + +void reduce_test1() +{ + test_reduce1(); + test_reduce2(); + 
// test_reduce3(); + test_reduce1(); + test_reduce2(); +} + +/////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + unsigned int seed = (unsigned int) std::time(nullptr); + bool seed_random = false; + + if (vm.count("seed")) + { + seed = vm["seed"].as(); + seed_random = true; + } + + if (vm.count("seed-random")) + seed_random = vm["seed-random"].as(); + + if (seed_random) + { + std::cout << "using seed: " << seed << std::endl; + std::cout << "** std::accumulate, hpx::reduce may fail due to " + "non-determinism of the floating summation" + << std::endl; + gen.seed(seed); + std::srand(seed); + } + else + { + gen.seed(223); + std::srand(223); + } + + reduce_test1(); + + return hpx::local::finalize(); +} + +int main(int argc, char* argv[]) +{ + // add command line option which controls the random number generator seed + using namespace hpx::program_options; + options_description desc_commandline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + + desc_commandline.add_options()("seed,s", value(), + "the random number generator seed to use for this run"); + desc_commandline.add_options()("seed-random", value(), + "switch for the random number generator seed to use for this run"); + + // By default this test should run on all available cores + std::vector const cfg = {"hpx.os_threads=all"}; + + // Initialize and run HPX + hpx::local::init_params init_args; + init_args.desc_cmdline = desc_commandline; + init_args.cfg = cfg; + + HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0, + "HPX main exited with non-zero status"); + + return hpx::util::report_errors(); +}