From 38788e69b664a418ab3e8aa12b255b63cffccb11 Mon Sep 17 00:00:00 2001 From: Hartmut Kaiser Date: Tue, 19 Nov 2024 14:48:54 -0600 Subject: [PATCH 01/10] Replace previously downloaded CDash conv.xsl with local version --- .circleci/config.yml | 7 +-- .circleci/conv.xsl | 121 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 .circleci/conv.xsl diff --git a/.circleci/config.yml b/.circleci/config.yml index 5b856872c90..a16f9e14a81 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -105,12 +105,9 @@ jobs: command: | git clone --depth=1 file:///hpx/source-full source - run: - name: Downloading CTest XML to Junit XML + name: Copying CTest XML to Junit XML command: | - curl \ - https://raw.githubusercontent.com/Kitware/CDash/master/app/cdash/tests/circle/conv.xsl \ - --fail \ - -o /hpx/conv.xsl + cp /hpx/source/.circleci/conv.xsl /hpx/conv.xsl - persist_to_workspace: root: /hpx paths: diff --git a/.circleci/conv.xsl b/.circleci/conv.xsl new file mode 100644 index 00000000000..e5b22ded600 --- /dev/null +++ b/.circleci/conv.xsl @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BuildName: + BuildStamp: + Name: + Generator: + CompilerName: + OSName: + Hostname: + OSRelease: + OSVersion: + OSPlatform: + Is64Bits: + VendorString: + VendorID: + FamilyID: + ModelID: + ProcessorCacheSize: + NumberOfLogicalCPU: + NumberOfPhysicalCPU: + TotalVirtualMemory: + TotalPhysicalMemory: + LogicalProcessorsPerPhysical: + ProcessorClockFrequency: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From fde3d47bd9d9a6f59cc92e2c1eb186bc94129f3f Mon Sep 17 00:00:00 2001 From: Hartmut Kaiser Date: Tue, 19 Nov 2024 15:11:07 -0600 Subject: [PATCH 02/10] Download Boost from their own archives, not from Sourceforge --- .github/workflows/macos_debug_fetch_hwloc.yml | 1 + .github/workflows/windows_release_gcc_mingw.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index caec2af3ac2..f778a6b117d 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,6 +19,7 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ + autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/.github/workflows/windows_release_gcc_mingw.yml b/.github/workflows/windows_release_gcc_mingw.yml index 97bb5685f0c..b36c02773ed 100644 --- a/.github/workflows/windows_release_gcc_mingw.yml +++ b/.github/workflows/windows_release_gcc_mingw.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2023 The STE||AR-Group +# Copyright (c) 2023-2024 The STE||AR-Group # # SPDX-License-Identifier: BSL-1.0 # Distributed under the Boost Software License, Version 1.0. 
(See accompanying @@ -22,7 +22,7 @@ jobs: choco install ninja -y md C:\projects $client = new-object System.Net.WebClient - $client.DownloadFile("https://master.dl.sourceforge.net/project/boost/boost/1.78.0/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z") + $client.DownloadFile("https://archives.boost.io/release/1.78.0/source/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z") 7z x C:\projects\boost_1_78_0.7z -y -oC:\projects\boost cd C:\projects\boost\boost_1_78_0 .\bootstrap.bat gcc From 6b9083f75795068c6f2a9dbaf751cefd7c69f14f Mon Sep 17 00:00:00 2001 From: Hartmut Kaiser Date: Wed, 20 Nov 2024 10:13:13 -0600 Subject: [PATCH 03/10] Remove leftovers from libfabric parcelport --- cmake/HPX_SetupBoost.cmake | 7 ------- cmake/toolchains/Cray.cmake | 21 --------------------- cmake/toolchains/CrayKNL.cmake | 21 --------------------- cmake/toolchains/CrayKNLStatic.cmake | 21 --------------------- cmake/toolchains/CrayStatic.cmake | 21 --------------------- 5 files changed, 91 deletions(-) diff --git a/cmake/HPX_SetupBoost.cmake b/cmake/HPX_SetupBoost.cmake index 43360e5571d..256b56e169f 100644 --- a/cmake/HPX_SetupBoost.cmake +++ b/cmake/HPX_SetupBoost.cmake @@ -117,13 +117,6 @@ if(NOT TARGET hpx_dependencies_boost) endif() set(__boost_libraries "") - if(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OR HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - ) - set(__boost_libraries ${__boost_libraries} log log_setup date_time chrono - thread - ) - endif() if(HPX_WITH_GENERIC_CONTEXT_COROUTINES) # if context is needed, we should still link with boost thread and chrono diff --git a/cmake/toolchains/Cray.cmake b/cmake/toolchains/Cray.cmake index 83b9c051f13..e2f369f063f 100644 --- a/cmake/toolchains/Cray.cmake +++ b/cmake/toolchains/Cray.cmake @@ -70,27 +70,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # We do a cross compilation here ... 
set(CMAKE_CROSSCOMPILING ON diff --git a/cmake/toolchains/CrayKNL.cmake b/cmake/toolchains/CrayKNL.cmake index 126bcc9a038..17d06245d37 100644 --- a/cmake/toolchains/CrayKNL.cmake +++ b/cmake/toolchains/CrayKNL.cmake @@ -68,27 +68,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the # right hints set(TBBMALLOC_PLATFORM diff --git a/cmake/toolchains/CrayKNLStatic.cmake b/cmake/toolchains/CrayKNLStatic.cmake index 97843059eaa..76e6160ba23 100644 --- a/cmake/toolchains/CrayKNLStatic.cmake +++ b/cmake/toolchains/CrayKNLStatic.cmake @@ -52,27 +52,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the # right hints set(TBBMALLOC_PLATFORM diff --git a/cmake/toolchains/CrayStatic.cmake b/cmake/toolchains/CrayStatic.cmake index f89757a2e72..6d1bc206108 100644 --- a/cmake/toolchains/CrayStatic.cmake +++ b/cmake/toolchains/CrayStatic.cmake @@ -62,24 +62,3 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED ON CACHE BOOL "" ) - -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) From 76d847a1496a6f8f346ff19aa4ddaf32dfef1ce3 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:30:59 -0600 Subject: [PATCH 04/10] [macOS] Comparison between exactly same types Signed-off-by: Shreyas Atre --- libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index b29652a3ede..d86fc577541 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp +++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1); + HPX_TEST_EQ(i.get_tag(), 1UL); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0); + HPX_TEST_EQ(j.get_next_tag(), 0UL); } { From 75973a8efc7640fed711493f06cc0dbe38d1ceee Mon Sep 17 
00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:32:20 -0600 Subject: [PATCH 05/10] [macOS] Apple does not seem to have any typedef for unsigned long int Signed-off-by: Shreyas Atre --- libs/core/debugging/src/print.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 3d7cf5da2aa..8a01d957485 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,6 +57,10 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); +#ifdef __APPLE__ + template HPX_CORE_EXPORT void print_dec( + std::ostream&, unsigned long const&, int); +#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int); From 5c36a4e47f0aba76889cf51fa20c2ef282e06d60 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 25 Nov 2024 11:10:30 -0600 Subject: [PATCH 06/10] Execute at_quick_exit feature test - macOSs command line tools 16+ apparently has issues with at_quick_exit not found - For those, this feature macro should be disabled as it cannot execute at runtime but compiles Signed-off-by: Shreyas Atre --- cmake/HPX_AddConfigTest.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/HPX_AddConfigTest.cmake b/cmake/HPX_AddConfigTest.cmake index 92b24ceec19..b0e1ea43745 100644 --- a/cmake/HPX_AddConfigTest.cmake +++ b/cmake/HPX_AddConfigTest.cmake @@ -408,7 +408,7 @@ function(hpx_check_for_cxx11_std_quick_exit) add_hpx_config_test( HPX_WITH_CXX11_STD_QUICK_EXIT SOURCE cmake/tests/cxx11_std_quick_exit.cpp - FILE ${ARGN} + FILE EXECUTE ${ARGN} ) endfunction() From 1682407fd0dfa4f1407a3c5f034f9de385efebd1 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 25 Nov 2024 11:15:59 -0600 Subject: [PATCH 07/10] Revert "[macOS] Apple does not seem to have any typedef for unsigned long int" This reverts commit 75973a8efc7640fed711493f06cc0dbe38d1ceee. Signed-off-by: Shreyas Atre --- libs/core/debugging/src/print.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 8a01d957485..3d7cf5da2aa 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,10 +57,6 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); -#ifdef __APPLE__ - template HPX_CORE_EXPORT void print_dec( - std::ostream&, unsigned long const&, int); -#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int); From 6ff0c9d27530074f7d5cdd151522f2a40fdbc9f1 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 25 Nov 2024 11:16:32 -0600 Subject: [PATCH 08/10] Revert "[macOS] Comparison between exactly same types" This reverts commit 76d847a1496a6f8f346ff19aa4ddaf32dfef1ce3. 
Signed-off-by: Shreyas Atre --- libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index d86fc577541..b29652a3ede 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp +++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1UL); + HPX_TEST_EQ(i.get_tag(), 1); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0UL); + HPX_TEST_EQ(j.get_next_tag(), 0); } { From 520f1613edd947bdb30e3702d39c7d9bb6f96d29 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 16 Dec 2024 19:24:47 -0600 Subject: [PATCH 09/10] Add reproducible summation reduction - Currently supports sequential version of reduction - Tests determinism by shuffling order of floating point numbers Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 80 ++ .../hpx/parallel/algorithms/detail/rfa.hpp | 1145 +++++++++++++++++ .../algorithms/reduce_deterministic.hpp | 537 ++++++++ .../tests/unit/algorithms/CMakeLists.txt | 1 + .../unit/algorithms/reduce_deterministic.cpp | 272 ++++ 5 files changed, 2035 insertions(+) create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp new file mode 100644 index 00000000000..87069858f49 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -0,0 +1,80 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "rfa.hpp" + +namespace hpx::parallel::detail { + + template + struct sequential_reduce_deterministic_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_t> + { + private: + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, + InIterE last, T init, Reduce&& r) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + size_t count = 0; + T max_val = std::abs(*first); + for (auto e = first; e != last; ++e) + { + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } + } + return rfa.conv(); + } + }; + +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_t + sequential_reduce_deterministic = + sequential_reduce_deterministic_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic( + Args&&... args) + { + return sequential_reduce_deterministic_t{}( + std::forward(args)...); + } +#endif + +} // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp new file mode 100644 index 00000000000..302f823fab7 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -0,0 +1,1145 @@ +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. 
Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include + +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + + template + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept + { + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; + } + + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + std::array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = std::ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char __rfa_bin_host_buffer__[sizeof(RFA_bins)]; + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>* = + nullptr> + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator + { + public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: + std::array data = {0}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same_v ? 
40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. + static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Return a binned floating-point reference bin + inline const ftype* binned_bins(const int x) const + { + return &reinterpret_cast&>( + __rfa_bin_host_buffer__)[x]; + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } + } + + ///Return primary vector value ref + inline ftype& primary(int i) + { + const auto& c = *this; + return const_cast(c.primary(i)); + } + + ///Return carry vector value ref + inline ftype& carry(int i) + { + const auto& c = *this; + return const_cast(c.carry(i)); + } + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) + { + return false; + } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif + +#ifdef DISABLE_NANINF + static inline constexpr int 
ISNANINF(const ftype) + { + return false; + } +#else + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } +#endif + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == 0.0) + { + return MAXINDEX; + } + else + { + std::frexp(x, &exp); + return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indicies correspond to smaller bins. + inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(primary(0))) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + if (ISNANINF(primary(0))) + return; + + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) + { + const ftype* const bins = binned_bins(X_index); + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; + } + } + else + { + int shift = binned_index() - X_index; + if (shift > 0) + { +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); + } + const ftype* const bins = binned_bins(X_index); +#pragma unroll + for (int j = 0; j < FOLD; j++) + { + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + ftype M; + ftype x = X; + + if (ISNANINF(x) || ISNANINF(primary(0))) + { + primary(0) += x; + return; + } + + if (binned_index0()) + { + M = primary(0); + ftype qd = x * COMPRESSION; + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + primary(0) = qd; + M -= qd; + M *= EXPANSION * 0.5; + x += M; + x += M; +#pragma unroll + for (int i = 1; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += 
qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); +#pragma unroll + for (int i = 0; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + inline void binned_dmrenorm(const int incpriX, const int inccarX) + { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) + return; + + for (int i = 0; i < FOLD; i++) + { + auto tmp_renormd = primary(i * incpriX); + auto& tmp_renorml = get_bits(tmp_renormd); + + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + primary(i * incpriX) = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = std::max( + std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); + Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); + Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + i = 2; + } + else + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); + Y += + (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + } + if (i == FOLD) + { + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += primary((i - 1) * incpriX) - bins[i - 1]; + } + Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + } + else + { + Y += carry(0) * (bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + } + return Y; + } + + 
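    // Usage sketch (illustrative only, not part of the patch itself): the
    // conversion above is normally reached through the public conv() member
    // declared later in this class. Assuming only the accumulator interface
    // defined in this header (operator+=, add(), conv()), a reproducible sum
    // of doubles looks like:
    //
    //   ReproducibleFloatingAccumulator<double> acc;
    //   acc += 1.0;
    //   acc += 1e-30;
    //   acc += -1.0;
    //   double s = acc.conv();   // same value for any summation order
    //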
///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carry(0) * (double) (bins[0] / 6.0) * + (double) EXPANSION; + Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carry(0) * (double) (bins[0] / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + if (ISZERO(x.primary(0))) + return; + + if (ISZERO(primary(0))) + { + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + return; + } + + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) + { + primary(0) += x.primary(0); + return; + } + + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); + //shift Y upwards and add X to Y +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); + } +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + } + else if (shift < 0) + { + const auto* const bins = binned_bins(X_index); + //shift X upwards and add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); + // add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); + } + } + + binned_dmrenorm(incpriY, inccarY); 
+ } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + constexpr int fold() const + { + return FOLD; + } + + ///Return the endurance of the binned fp + constexpr int endurance() const + { + return ENDURANCE; + } + + ///Returns the number of reference bins. Used for judging memory usage. + constexpr size_t number_of_reference_bins() + { + return std::array::size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. 
+ ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (primary(0) != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same_v) + { + return binned_conv_single(1, 1); + } + else + { + return binned_conv_double(1, 1); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + size_t N = last - first; + for (; first != last; first++, count++) + { + binned_dmddeposit(static_cast(*first), 1); + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template >* = nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + return; + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template >* = nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = max(max_abs_val, std::abs(input[i])); + } + 
add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. 
See the source of + ///`add()` for an example + template >* = nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; + +} // namespace hpx::parallel::detail::rfa \ No newline at end of file diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp new file mode 100644 index 00000000000..9920d3bd7a6 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -0,0 +1,537 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/// \file parallel/algorithms/reduce_deterministic.hpp +/// \page hpx::reduce_deterministic +/// \headerfile hpx/algorithm.hpp + +#pragma once + +#if defined(DOXYGEN) + +namespace hpx { + + // clang-format off + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a FwdIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. 
+ /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + hpx::parallel::util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). 
+ /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns T otherwise (where T is the value_type of + /// \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename hpx::parallel::util::detail::algorithm_result::value_type + >::type + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last); + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. 
+ /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a InIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// \returns The \a reduce algorithm returns \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// \returns The \a reduce algorithm returns a \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. 
+ /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// \returns The \a reduce algorithm returns \a T (where T is the + /// value_type of \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename std::iterator_traits::value_type + reduce_deterministic(FwdIter first, FwdIter last); + // clang-format on +} // namespace hpx + +#else // DOXYGEN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace hpx::parallel { + + /////////////////////////////////////////////////////////////////////////// + // reduce + namespace detail { + + /// \cond NOINTERNAL + template + struct reduce_deterministic + : public algorithm, T> + { + constexpr reduce_deterministic() noexcept + : algorithm("reduce_deterministic") + { + } + + template + static constexpr T sequential(ExPolicy&& policy, InIterB first, + InIterE last, T_&& init, Reduce&& r) + { + return hpx::parallel::detail::sequential_reduce_deterministic< + ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, + HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); + } + }; + /// \endcond + } // namespace detail +} // namespace hpx::parallel + +namespace hpx { + + /////////////////////////////////////////////////////////////////////////// + // CPO for hpx::reduce + inline constexpr struct reduce_deterministic_t final + : hpx::detail::tag_parallel_algorithm + { + private: + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init, F f) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return 
hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t::value_type> + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{}, + std::plus<>{}); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template + )> + // clang-format on + friend typename std::iterator_traits::value_type + tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(hpx::execution::seq, first, last, value_type{}, + std::plus<>()); + } + } reduce_deterministic{}; +} // namespace hpx + +#endif // DOXYGEN diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 7d3a9204530..559ee830030 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -94,6 +94,7 @@ set(tests partition_copy reduce_ reduce_by_key + reduce_deterministic remove remove1 remove2 diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp new file mode 100644 index 00000000000..92dd2e7f3dc --- /dev/null +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -0,0 +1,272 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// 
Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils.hpp" + +int seed = std::random_device{}(); +std::mt19937 gen(seed); + +template +T get_rand( + T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +{ + return LO + + static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +template +void test_reduce1(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); + + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + // verify values + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(r1, r3); + HPX_TEST_EQ(r2, r3); +} + +template +void test_reduce_determinism(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = + get_rand(-num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic( + iterator_det(std::begin(deterministic_shuffled)), + iterator_det(std::end(deterministic_shuffled)), val_det, op); + + HPX_TEST_EQ(r1, + r1_shuffled); // Deterministically calculated, should always satisfy +} + +/// This test function is never called because it is not guaranteed to pass +/// It serves an important purpose to demonstrate that floating point summation +/// is not always associative i.e. 
a+b+c != a+c+b +template +void test_orig_reduce_determinism(IteratorTag) +{ + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + constexpr auto num_bounds_ndet = + std::is_same_v ? 1000.0f : 1000000.0f; + + std::vector nondeterministic(LEN); + for (size_t i = 0; i < LEN; ++i) + { + nondeterministic[i] = get_rand( + -num_bounds_ndet, num_bounds_ndet); + } + std::vector nondeterministic_shuffled = + nondeterministic; + std::shuffle(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), gen); + + FloatTypeNonDeterministic val_non_det(41.999); + + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic_shuffled)), + iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + FloatTypeNonDeterministic r3_shuffled = + std::accumulate(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), val_non_det); + + /// failed around 131 times out of 1000 on macOS arm + /// Floating point addition is not necessarily associative, + /// might fail on an architecture not yet known with much higher precision + HPX_TEST_NEQ(r2, r2_shuffled); + HPX_TEST_NEQ(r3, r3_shuffled); +} + +template +void test_reduce1() +{ + using namespace hpx::execution; + + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); +} + +template +void test_reduce2() +{ + using namespace hpx::execution; + + test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); +} + +// template +// void test_reduce3() +// { +// using namespace hpx::execution; + +// test_orig_reduce_determinism(IteratorTag()); +// test_orig_reduce_determinism(IteratorTag()); +// } + +void reduce_test1() +{ + test_reduce1(); + test_reduce2(); + // test_reduce3(); + test_reduce1(); + test_reduce2(); +} + +/////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + unsigned int seed = (unsigned int) std::time(nullptr); + bool seed_random = false; + + if (vm.count("seed")) + { + seed = vm["seed"].as(); + seed_random = true; + } + + if (vm.count("seed-random")) + seed_random = vm["seed-random"].as(); + + if (seed_random) + { + std::cout << "using seed: " << seed << std::endl; + std::cout << "** std::accumulate, hpx::reduce may fail due to " + "non-determinism of the floating summation" + << std::endl; + gen.seed(seed); + std::srand(seed); + } + else + { + gen.seed(223); + std::srand(223); + } + + reduce_test1(); + + return hpx::local::finalize(); +} + +int main(int argc, char* argv[]) +{ + // add command line option which controls the random number generator seed + using namespace hpx::program_options; + options_description desc_commandline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + + desc_commandline.add_options()("seed,s", value(), + "the random number generator seed to use for this run"); + desc_commandline.add_options()("seed-random", value(), + "switch for the random number generator seed to use for this run"); + + // By default this test should run on all 
available cores + std::vector const cfg = {"hpx.os_threads=all"}; + + // Initialize and run HPX + hpx::local::init_params init_args; + init_args.desc_cmdline = desc_commandline; + init_args.cfg = cfg; + + HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0, + "HPX main exited with non-zero status"); + + return hpx::util::report_errors(); +} From 8b0ce4cd5e34598840e5f28851341de4d8d01213 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 17 Dec 2024 14:22:55 -0500 Subject: [PATCH 10/10] Address inspect tool, check module cmakelists, warnings and spell check - missing includes - prevent max/min being expanded as macros - minor spell check correction - remove pragma once in cpp file - resolve implicit type conversions in rfa type to single and double and other places - add dual license - remove unnecessary command for macos ci - use HPX_UNROLL instead of vanilla pragma - clang-17 cannot unroll so use checks - add typename qualifier for iterator type Signed-off-by: Shreyas Atre --- .github/workflows/macos_debug_fetch_hwloc.yml | 1 - libs/core/algorithms/CMakeLists.txt | 3 + .../detail/reduce_deterministic.hpp | 3 + .../hpx/parallel/algorithms/detail/rfa.hpp | 159 ++++++++++++------ .../unit/algorithms/reduce_deterministic.cpp | 26 +-- 5 files changed, 133 insertions(+), 59 deletions(-) diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index f778a6b117d..caec2af3ac2 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,7 +19,6 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ - autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt index 6fcfed897e2..9090345722d 100644 --- a/libs/core/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/CMakeLists.txt @@ -37,7 +37,9 @@ set(algorithms_headers hpx/parallel/algorithms/detail/parallel_stable_sort.hpp hpx/parallel/algorithms/detail/pivot.hpp hpx/parallel/algorithms/detail/reduce.hpp + hpx/parallel/algorithms/detail/reduce_deterministic.hpp hpx/parallel/algorithms/detail/replace.hpp + hpx/parallel/algorithms/detail/rfa.hpp hpx/parallel/algorithms/detail/rotate.hpp hpx/parallel/algorithms/detail/sample_sort.hpp hpx/parallel/algorithms/detail/search.hpp @@ -72,6 +74,7 @@ set(algorithms_headers hpx/parallel/algorithms/partition.hpp hpx/parallel/algorithms/reduce_by_key.hpp hpx/parallel/algorithms/reduce.hpp + hpx/parallel/algorithms/reduce_deterministic.hpp hpx/parallel/algorithms/remove_copy.hpp hpx/parallel/algorithms/remove.hpp hpx/parallel/algorithms/replace.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 87069858f49..b3773088917 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -32,6 +33,8 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) { + /// TODO: Put constraint on Reduce to be a binary plus operator + (void) r; hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); 
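+        // The bin table initialized above is copied into the global
+        // rfa::__rfa_bin_host_buffer__ below; presumably this buffer acts as
+        // the shared bin lookup table consulted by the reproducible
+        // accumulator defined in rfa.hpp.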
std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index 302f823fab7..fa9142cdf80 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -1,3 +1,34 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// --------------------------------------------------------------------------- +// This file has been taken from +// https://github.com/maddyscientist/reproducible_floating_sums commit +// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added +// to this file in accordance with the dual license terms for the Reproducible +// Floating-Point Summations and conformance with the HPX policy +// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md +// --------------------------------------------------------------------------- +// +/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. //Reproducible Floating Point Accumulations via Binned Floating Point //Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. //ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. @@ -26,6 +57,10 @@ #include #include #include +#include +#include + +#include namespace hpx::parallel::detail::rfa { template @@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa { static constexpr int FOLD = FOLD_; private: - std::array data = {0}; + std::array data = {{0}}; ///Floating-point precision bin width static constexpr auto BIN_WIDTH = @@ -351,21 +386,21 @@ namespace hpx::parallel::detail::rfa { ///Get index of float-point precision ///The index of a non-binned type is the smallest index a binned type would - ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///need to have to sum it reproducibly. Higher indices correspond to smaller ///bins. 
static inline constexpr int binned_dindex(const ftype x) { int exp = EXP(x); if (exp == 0) { - if (x == 0.0) + if (x == static_cast(0.0)) { return MAXINDEX; } else { std::frexp(x, &exp); - return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); } } return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; @@ -373,7 +408,7 @@ namespace hpx::parallel::detail::rfa { ///Get index of manually specified binned double precision ///The index of a binned type is the bin that it corresponds to. Higher - ///indicies correspond to smaller bins. + ///indices correspond to smaller bins. inline int binned_index() const { return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - @@ -416,7 +451,9 @@ namespace hpx::parallel::detail::rfa { int shift = binned_index() - X_index; if (shift > 0) { -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -425,7 +462,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = carry((i - shift) * inccarY); } const ftype* const bins = binned_bins(X_index); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int j = 0; j < FOLD; j++) { if (j >= shift) @@ -457,16 +496,19 @@ namespace hpx::parallel::detail::rfa { if (binned_index0()) { M = primary(0); - ftype qd = x * COMPRESSION; + ftype qd = x * static_cast(COMPRESSION); auto& ql = get_bits(qd); ql |= 1; qd += M; primary(0) = qd; M -= qd; - M *= EXPANSION * 0.5; + auto temp_m = (double) (((double) EXPANSION) * 0.5); + M *= static_cast(temp_m); x += M; x += M; -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 1; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -485,7 +527,9 @@ namespace hpx::parallel::detail::rfa { { ftype qd = x; auto& ql = get_bits(qd); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -550,7 +594,7 @@ namespace hpx::parallel::detail::rfa { int i = 0; if (ISNANINF(primary(0))) - return primary(0); + return (double) primary(0); if (ISZERO(primary(0))) return 0.0; @@ -564,29 +608,36 @@ namespace hpx::parallel::detail::rfa { { scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); - scaled = std::max( - std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + scaled = (std::max)( + (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); if (X_index == 0) { - Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); - Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + Y += ((double) carry(0)) * + ((((double) bins[0]) / 6.0) * scale_down * EXPANSION); + Y += ((double) carry(inccarX)) * + ((((double) bins[1]) / 6.0) * scale_down); + Y += ((double) primary(0) - (double) bins[0]) * scale_down * + EXPANSION; i = 2; } else { - Y += carry(0) * ((bins[0] / 6.0) * scale_down); + Y += ((double) carry(0)) * + (((double) bins[0] / 6.0) * scale_down); i = 1; } for (; i < scaled; i++) { - Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); - Y += - (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + Y += ((double) carry(i * inccarX)) * + (((double) bins[i] / 6.0) * scale_down); + Y += ((double) primary((i - 1) * incpriX) - + (double) (bins[i - 1])) * + scale_down; } if (i == FOLD) { - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + Y += ((double) primary((FOLD - 1) * incpriX) - + (double) 
(bins[FOLD - 1])) * scale_down; return Y * scale_up; } @@ -597,20 +648,23 @@ namespace hpx::parallel::detail::rfa { Y *= scale_up; for (; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += primary((i - 1) * incpriX) - bins[i - 1]; + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + Y += ((double) primary((FOLD - 1) * incpriX) - + ((double) bins[FOLD - 1])); } else { - Y += carry(0) * (bins[0] / 6.0); + Y += ((double) carry(0)) * ((double) bins[0] / 6.0); for (i = 1; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += (primary((i - 1) * incpriX) - bins[i - 1]); + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); } return Y; } @@ -627,7 +681,7 @@ namespace hpx::parallel::detail::rfa { if (ISNANINF(primary(0))) return primary(0); if (ISZERO(primary(0))) - return 0.0; + return 0.0f; //Note that the following order of summation is in order of decreasing //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and @@ -636,20 +690,22 @@ namespace hpx::parallel::detail::rfa { const auto* const bins = binned_bins(X_index); if (X_index == 0) { - Y += (double) carry(0) * (double) (bins[0] / 6.0) * + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) * (double) EXPANSION; - Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) carry(inccarX) * + (double) (((double) bins[1]) / 6.0); Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; i = 2; } else { - Y += (double) carry(0) * (double) (bins[0] / 6.0); + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0); i = 1; } for (; i < FOLD; i++) { - Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) carry(i * inccarX) * + (double) (((double) bins[i]) / 6.0); Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); @@ -694,8 +750,10 @@ namespace hpx::parallel::detail::rfa { if (shift > 0) { const auto* const bins = binned_bins(Y_index); - //shift Y upwards and add X to Y -#pragma unroll +//shift Y upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -705,7 +763,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = x.carry(i * inccarX) + carry((i - shift) * inccarY); } -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i == shift) @@ -717,8 +777,10 @@ namespace hpx::parallel::detail::rfa { else if (shift < 0) { const auto* const bins = binned_bins(X_index); - //shift X upwards and add X to Y -#pragma unroll +//shift X upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i < -shift) @@ -731,8 +793,10 @@ namespace hpx::parallel::detail::rfa { else if (shift == 0) { const auto* const bins = binned_bins(X_index); - // add X to Y -#pragma unroll +// add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; @@ -771,7 +835,7 @@ namespace hpx::parallel::detail::rfa { } ///Return the endurance of the binned fp - constexpr int endurance() const + constexpr 
size_t endurance() const { return ENDURANCE; } @@ -867,11 +931,11 @@ namespace hpx::parallel::detail::rfa { { if (std::is_same_v) { - return binned_conv_single(1, 1); + return static_cast(binned_conv_single(1, 1)); } else { - return binned_conv_double(1, 1); + return static_cast(binned_conv_double(1, 1)); } } @@ -888,7 +952,8 @@ namespace hpx::parallel::detail::rfa { { const double X = std::abs(max_abs_val); const double S = std::abs(binned_sum); - return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + return static_cast( + (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) * std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + ((7.0 * EPSILON) / (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - @@ -973,7 +1038,7 @@ namespace hpx::parallel::detail::rfa { T max_abs_val = input[0]; for (size_t i = 0; i < N; i++) { - max_abs_val = max(max_abs_val, std::abs(input[i])); + max_abs_val = (std::max)(max_abs_val, std::abs(input[i])); } add(input, N, max_abs_val); } @@ -1142,4 +1207,4 @@ namespace hpx::parallel::detail::rfa { } }; -} // namespace hpx::parallel::detail::rfa \ No newline at end of file +} // namespace hpx::parallel::detail::rfa diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 92dd2e7f3dc..5a06c509efd 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -4,8 +4,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#pragma once - #include #include #include @@ -19,6 +17,7 @@ #include #include #include +#include #include #include "test_utils.hpp" @@ -27,11 +26,12 @@ int seed = std::random_device{}(); std::mt19937 gen(seed); template -T get_rand( - T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) { return LO + - static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); } /////////////////////////////////////////////////////////////////////////////// @@ -42,10 +42,12 @@ void test_reduce1(IteratorTag) { // check if different type for deterministic and nondeeterministic // and same result i.e. correct computation - using base_iterator_det = std::vector::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; std::vector deterministic(LEN); @@ -75,8 +77,8 @@ void test_reduce1(IteratorTag) FloatTypeNonDeterministic r3 = std::accumulate( nondeterministic.begin(), nondeterministic.end(), val_non_det); - HPX_TEST_EQ(r1, r3); - HPX_TEST_EQ(r2, r3); + HPX_TEST_EQ(static_cast(r1), r3); + HPX_TEST_EQ(static_cast(r2), r3); } template ::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; constexpr FloatTypeDeterministic num_bounds_det = @@ -129,7 +132,8 @@ template void test_orig_reduce_determinism(IteratorTag) { - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; constexpr auto num_bounds_ndet =