From 38788e69b664a418ab3e8aa12b255b63cffccb11 Mon Sep 17 00:00:00 2001 From: Hartmut Kaiser Date: Tue, 19 Nov 2024 14:48:54 -0600 Subject: [PATCH 01/10] Replace previously downloaded CDash conv.xsl with local version --- .circleci/config.yml | 7 +-- .circleci/conv.xsl | 121 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 .circleci/conv.xsl diff --git a/.circleci/config.yml b/.circleci/config.yml index 5b856872c90..a16f9e14a81 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -105,12 +105,9 @@ jobs: command: | git clone --depth=1 file:///hpx/source-full source - run: - name: Downloading CTest XML to Junit XML + name: Copying CTest XML to Junit XML command: | - curl \ - https://raw.githubusercontent.com/Kitware/CDash/master/app/cdash/tests/circle/conv.xsl \ - --fail \ - -o /hpx/conv.xsl + cp /hpx/source/.circleci/conv.xsl /hpx/conv.xsl - persist_to_workspace: root: /hpx paths: diff --git a/.circleci/conv.xsl b/.circleci/conv.xsl new file mode 100644 index 00000000000..e5b22ded600 --- /dev/null +++ b/.circleci/conv.xsl @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BuildName: + BuildStamp: + Name: + Generator: + CompilerName: + OSName: + Hostname: + OSRelease: + OSVersion: + OSPlatform: + Is64Bits: + VendorString: + VendorID: + FamilyID: + ModelID: + ProcessorCacheSize: + NumberOfLogicalCPU: + NumberOfPhysicalCPU: + TotalVirtualMemory: + TotalPhysicalMemory: + LogicalProcessorsPerPhysical: + ProcessorClockFrequency: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From fde3d47bd9d9a6f59cc92e2c1eb186bc94129f3f Mon Sep 17 00:00:00 2001 From: Hartmut Kaiser Date: Tue, 19 Nov 2024 15:11:07 -0600 Subject: [PATCH 02/10] Download Boost from their own archives, not from Sourceforge --- .github/workflows/macos_debug_fetch_hwloc.yml | 1 + .github/workflows/windows_release_gcc_mingw.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index caec2af3ac2..f778a6b117d 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,6 +19,7 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ + autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/.github/workflows/windows_release_gcc_mingw.yml b/.github/workflows/windows_release_gcc_mingw.yml index 97bb5685f0c..b36c02773ed 100644 --- a/.github/workflows/windows_release_gcc_mingw.yml +++ b/.github/workflows/windows_release_gcc_mingw.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2023 The STE||AR-Group +# Copyright (c) 2023-2024 The STE||AR-Group # # SPDX-License-Identifier: BSL-1.0 # Distributed under the Boost Software License, Version 1.0. 
(See accompanying @@ -22,7 +22,7 @@ jobs: choco install ninja -y md C:\projects $client = new-object System.Net.WebClient - $client.DownloadFile("https://master.dl.sourceforge.net/project/boost/boost/1.78.0/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z") + $client.DownloadFile("https://archives.boost.io/release/1.78.0/source/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z") 7z x C:\projects\boost_1_78_0.7z -y -oC:\projects\boost cd C:\projects\boost\boost_1_78_0 .\bootstrap.bat gcc From 6b9083f75795068c6f2a9dbaf751cefd7c69f14f Mon Sep 17 00:00:00 2001 From: Hartmut Kaiser Date: Wed, 20 Nov 2024 10:13:13 -0600 Subject: [PATCH 03/10] Remove leftovers from libfabric parcelport --- cmake/HPX_SetupBoost.cmake | 7 ------- cmake/toolchains/Cray.cmake | 21 --------------------- cmake/toolchains/CrayKNL.cmake | 21 --------------------- cmake/toolchains/CrayKNLStatic.cmake | 21 --------------------- cmake/toolchains/CrayStatic.cmake | 21 --------------------- 5 files changed, 91 deletions(-) diff --git a/cmake/HPX_SetupBoost.cmake b/cmake/HPX_SetupBoost.cmake index 43360e5571d..256b56e169f 100644 --- a/cmake/HPX_SetupBoost.cmake +++ b/cmake/HPX_SetupBoost.cmake @@ -117,13 +117,6 @@ if(NOT TARGET hpx_dependencies_boost) endif() set(__boost_libraries "") - if(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OR HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - ) - set(__boost_libraries ${__boost_libraries} log log_setup date_time chrono - thread - ) - endif() if(HPX_WITH_GENERIC_CONTEXT_COROUTINES) # if context is needed, we should still link with boost thread and chrono diff --git a/cmake/toolchains/Cray.cmake b/cmake/toolchains/Cray.cmake index 83b9c051f13..e2f369f063f 100644 --- a/cmake/toolchains/Cray.cmake +++ b/cmake/toolchains/Cray.cmake @@ -70,27 +70,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # We do a cross compilation here ... 
set(CMAKE_CROSSCOMPILING ON diff --git a/cmake/toolchains/CrayKNL.cmake b/cmake/toolchains/CrayKNL.cmake index 126bcc9a038..17d06245d37 100644 --- a/cmake/toolchains/CrayKNL.cmake +++ b/cmake/toolchains/CrayKNL.cmake @@ -68,27 +68,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the # right hints set(TBBMALLOC_PLATFORM diff --git a/cmake/toolchains/CrayKNLStatic.cmake b/cmake/toolchains/CrayKNLStatic.cmake index 97843059eaa..76e6160ba23 100644 --- a/cmake/toolchains/CrayKNLStatic.cmake +++ b/cmake/toolchains/CrayKNLStatic.cmake @@ -52,27 +52,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the # right hints set(TBBMALLOC_PLATFORM diff --git a/cmake/toolchains/CrayStatic.cmake b/cmake/toolchains/CrayStatic.cmake index f89757a2e72..6d1bc206108 100644 --- a/cmake/toolchains/CrayStatic.cmake +++ b/cmake/toolchains/CrayStatic.cmake @@ -62,24 +62,3 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED ON CACHE BOOL "" ) - -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) From 76d847a1496a6f8f346ff19aa4ddaf32dfef1ce3 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:30:59 -0600 Subject: [PATCH 04/10] [macOS] Comparison between exactly same types Signed-off-by: Shreyas Atre --- libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index b29652a3ede..d86fc577541 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp +++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1); + HPX_TEST_EQ(i.get_tag(), 1UL); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0); + HPX_TEST_EQ(j.get_next_tag(), 0UL); } { From 75973a8efc7640fed711493f06cc0dbe38d1ceee Mon Sep 17 
00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:32:20 -0600 Subject: [PATCH 05/10] [macOS] Apple does not seem to have any typedef for unsigned long int Signed-off-by: Shreyas Atre --- libs/core/debugging/src/print.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 3d7cf5da2aa..8a01d957485 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,6 +57,10 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); +#ifdef __APPLE__ + template HPX_CORE_EXPORT void print_dec( + std::ostream&, unsigned long const&, int); +#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int); From 5c36a4e47f0aba76889cf51fa20c2ef282e06d60 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 25 Nov 2024 11:10:30 -0600 Subject: [PATCH 06/10] Execute at_quick_exit feature test - macOSs command line tools 16+ apparently has issues with at_quick_exit not found - For those, this feature macro should be disabled as it cannot execute at runtime but compiles Signed-off-by: Shreyas Atre --- cmake/HPX_AddConfigTest.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/HPX_AddConfigTest.cmake b/cmake/HPX_AddConfigTest.cmake index 92b24ceec19..b0e1ea43745 100644 --- a/cmake/HPX_AddConfigTest.cmake +++ b/cmake/HPX_AddConfigTest.cmake @@ -408,7 +408,7 @@ function(hpx_check_for_cxx11_std_quick_exit) add_hpx_config_test( HPX_WITH_CXX11_STD_QUICK_EXIT SOURCE cmake/tests/cxx11_std_quick_exit.cpp - FILE ${ARGN} + FILE EXECUTE ${ARGN} ) endfunction() From 1682407fd0dfa4f1407a3c5f034f9de385efebd1 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 25 Nov 2024 11:15:59 -0600 Subject: [PATCH 07/10] Revert "[macOS] Apple does not seem to have any typedef for unsigned long int" This reverts commit 75973a8efc7640fed711493f06cc0dbe38d1ceee. Signed-off-by: Shreyas Atre --- libs/core/debugging/src/print.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 8a01d957485..3d7cf5da2aa 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,10 +57,6 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); -#ifdef __APPLE__ - template HPX_CORE_EXPORT void print_dec( - std::ostream&, unsigned long const&, int); -#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int); From 6ff0c9d27530074f7d5cdd151522f2a40fdbc9f1 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 25 Nov 2024 11:16:32 -0600 Subject: [PATCH 08/10] Revert "[macOS] Comparison between exactly same types" This reverts commit 76d847a1496a6f8f346ff19aa4ddaf32dfef1ce3. 
Signed-off-by: Shreyas Atre --- libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index d86fc577541..b29652a3ede 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp +++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1UL); + HPX_TEST_EQ(i.get_tag(), 1); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0UL); + HPX_TEST_EQ(j.get_next_tag(), 0); } { From 520f1613edd947bdb30e3702d39c7d9bb6f96d29 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 16 Dec 2024 19:24:47 -0600 Subject: [PATCH 09/10] Add reproducible summation reduction - Currently supports sequential version of reduction - Tests determinism by shuffling order of floating point numbers Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 80 ++ .../hpx/parallel/algorithms/detail/rfa.hpp | 1145 +++++++++++++++++ .../algorithms/reduce_deterministic.hpp | 537 ++++++++ .../tests/unit/algorithms/CMakeLists.txt | 1 + .../unit/algorithms/reduce_deterministic.cpp | 272 ++++ 5 files changed, 2035 insertions(+) create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp new file mode 100644 index 00000000000..87069858f49 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -0,0 +1,80 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "rfa.hpp" + +namespace hpx::parallel::detail { + + template + struct sequential_reduce_deterministic_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_t> + { + private: + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, + InIterE last, T init, Reduce&& r) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + size_t count = 0; + T max_val = std::abs(*first); + for (auto e = first; e != last; ++e) + { + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } + } + return rfa.conv(); + } + }; + +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_t + sequential_reduce_deterministic = + sequential_reduce_deterministic_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic( + Args&&... args) + { + return sequential_reduce_deterministic_t{}( + std::forward(args)...); + } +#endif + +} // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp new file mode 100644 index 00000000000..302f823fab7 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -0,0 +1,1145 @@ +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. 
Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include + +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + + template + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept + { + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; + } + + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + std::array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = std::ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char __rfa_bin_host_buffer__[sizeof(RFA_bins)]; + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>* = + nullptr> + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator + { + public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: + std::array data = {0}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same_v ? 
40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. + static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Return a binned floating-point reference bin + inline const ftype* binned_bins(const int x) const + { + return &reinterpret_cast&>( + __rfa_bin_host_buffer__)[x]; + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } + } + + ///Return primary vector value ref + inline ftype& primary(int i) + { + const auto& c = *this; + return const_cast(c.primary(i)); + } + + ///Return carry vector value ref + inline ftype& carry(int i) + { + const auto& c = *this; + return const_cast(c.carry(i)); + } + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) + { + return false; + } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif + +#ifdef DISABLE_NANINF + static inline constexpr int 
ISNANINF(const ftype) + { + return false; + } +#else + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } +#endif + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == 0.0) + { + return MAXINDEX; + } + else + { + std::frexp(x, &exp); + return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indicies correspond to smaller bins. + inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(primary(0))) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + if (ISNANINF(primary(0))) + return; + + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) + { + const ftype* const bins = binned_bins(X_index); + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; + } + } + else + { + int shift = binned_index() - X_index; + if (shift > 0) + { +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); + } + const ftype* const bins = binned_bins(X_index); +#pragma unroll + for (int j = 0; j < FOLD; j++) + { + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + ftype M; + ftype x = X; + + if (ISNANINF(x) || ISNANINF(primary(0))) + { + primary(0) += x; + return; + } + + if (binned_index0()) + { + M = primary(0); + ftype qd = x * COMPRESSION; + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + primary(0) = qd; + M -= qd; + M *= EXPANSION * 0.5; + x += M; + x += M; +#pragma unroll + for (int i = 1; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += 
qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); +#pragma unroll + for (int i = 0; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + inline void binned_dmrenorm(const int incpriX, const int inccarX) + { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) + return; + + for (int i = 0; i < FOLD; i++) + { + auto tmp_renormd = primary(i * incpriX); + auto& tmp_renorml = get_bits(tmp_renormd); + + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + primary(i * incpriX) = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = std::max( + std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); + Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); + Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + i = 2; + } + else + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); + Y += + (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + } + if (i == FOLD) + { + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += primary((i - 1) * incpriX) - bins[i - 1]; + } + Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + } + else + { + Y += carry(0) * (bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + } + return Y; + } + + 
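    // Usage sketch (illustrative only, not part of the patch itself): the
    // conversion above is normally reached through the public conv() member
    // declared later in this class. Assuming only the accumulator interface
    // defined in this header (operator+=, add(), conv()), a reproducible sum
    // of doubles looks like:
    //
    //   ReproducibleFloatingAccumulator<double> acc;
    //   acc += 1.0;
    //   acc += 1e-30;
    //   acc += -1.0;
    //   double s = acc.conv();   // same value for any summation order
    //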
///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carry(0) * (double) (bins[0] / 6.0) * + (double) EXPANSION; + Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carry(0) * (double) (bins[0] / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + if (ISZERO(x.primary(0))) + return; + + if (ISZERO(primary(0))) + { + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + return; + } + + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) + { + primary(0) += x.primary(0); + return; + } + + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); + //shift Y upwards and add X to Y +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); + } +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + } + else if (shift < 0) + { + const auto* const bins = binned_bins(X_index); + //shift X upwards and add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); + // add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); + } + } + + binned_dmrenorm(incpriY, inccarY); 
+ } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + constexpr int fold() const + { + return FOLD; + } + + ///Return the endurance of the binned fp + constexpr int endurance() const + { + return ENDURANCE; + } + + ///Returns the number of reference bins. Used for judging memory usage. + constexpr size_t number_of_reference_bins() + { + return std::array::size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. 
+ ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (primary(0) != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same_v) + { + return binned_conv_single(1, 1); + } + else + { + return binned_conv_double(1, 1); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + size_t N = last - first; + for (; first != last; first++, count++) + { + binned_dmddeposit(static_cast(*first), 1); + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template >* = nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + return; + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template >* = nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = max(max_abs_val, std::abs(input[i])); + } + 
add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. 
See the source of + ///`add()` for an example + template >* = nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; + +} // namespace hpx::parallel::detail::rfa \ No newline at end of file diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp new file mode 100644 index 00000000000..9920d3bd7a6 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -0,0 +1,537 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/// \file parallel/algorithms/reduce_deterministic.hpp +/// \page hpx::reduce_deterministic +/// \headerfile hpx/algorithm.hpp + +#pragma once + +#if defined(DOXYGEN) + +namespace hpx { + + // clang-format off + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a FwdIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. 
+ /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + hpx::parallel::util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). 
+ /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns T otherwise (where T is the value_type of + /// \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename hpx::parallel::util::detail::algorithm_result::value_type + >::type + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last); + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. 
+ /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a InIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// \returns The \a reduce algorithm returns \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// \returns The \a reduce algorithm returns a \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. 
+ /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// \returns The \a reduce algorithm returns \a T (where T is the + /// value_type of \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename std::iterator_traits::value_type + reduce_deterministic(FwdIter first, FwdIter last); + // clang-format on +} // namespace hpx + +#else // DOXYGEN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace hpx::parallel { + + /////////////////////////////////////////////////////////////////////////// + // reduce + namespace detail { + + /// \cond NOINTERNAL + template + struct reduce_deterministic + : public algorithm, T> + { + constexpr reduce_deterministic() noexcept + : algorithm("reduce_deterministic") + { + } + + template + static constexpr T sequential(ExPolicy&& policy, InIterB first, + InIterE last, T_&& init, Reduce&& r) + { + return hpx::parallel::detail::sequential_reduce_deterministic< + ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, + HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); + } + }; + /// \endcond + } // namespace detail +} // namespace hpx::parallel + +namespace hpx { + + /////////////////////////////////////////////////////////////////////////// + // CPO for hpx::reduce + inline constexpr struct reduce_deterministic_t final + : hpx::detail::tag_parallel_algorithm + { + private: + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init, F f) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return 
hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t::value_type> + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{}, + std::plus<>{}); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template + )> + // clang-format on + friend typename std::iterator_traits::value_type + tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(hpx::execution::seq, first, last, value_type{}, + std::plus<>()); + } + } reduce_deterministic{}; +} // namespace hpx + +#endif // DOXYGEN diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 7d3a9204530..559ee830030 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -94,6 +94,7 @@ set(tests partition_copy reduce_ reduce_by_key + reduce_deterministic remove remove1 remove2 diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp new file mode 100644 index 00000000000..92dd2e7f3dc --- /dev/null +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -0,0 +1,272 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// 
Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils.hpp" + +int seed = std::random_device{}(); +std::mt19937 gen(seed); + +template +T get_rand( + T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +{ + return LO + + static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +template +void test_reduce1(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); + + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + // verify values + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(r1, r3); + HPX_TEST_EQ(r2, r3); +} + +template +void test_reduce_determinism(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = + get_rand(-num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic( + iterator_det(std::begin(deterministic_shuffled)), + iterator_det(std::end(deterministic_shuffled)), val_det, op); + + HPX_TEST_EQ(r1, + r1_shuffled); // Deterministically calculated, should always satisfy +} + +/// This test function is never called because it is not guaranteed to pass +/// It serves an important purpose to demonstrate that floating point summation +/// is not always associative i.e. 
a+b+c != a+c+b +template +void test_orig_reduce_determinism(IteratorTag) +{ + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + constexpr auto num_bounds_ndet = + std::is_same_v ? 1000.0f : 1000000.0f; + + std::vector nondeterministic(LEN); + for (size_t i = 0; i < LEN; ++i) + { + nondeterministic[i] = get_rand( + -num_bounds_ndet, num_bounds_ndet); + } + std::vector nondeterministic_shuffled = + nondeterministic; + std::shuffle(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), gen); + + FloatTypeNonDeterministic val_non_det(41.999); + + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic_shuffled)), + iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + FloatTypeNonDeterministic r3_shuffled = + std::accumulate(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), val_non_det); + + /// failed around 131 times out of 1000 on macOS arm + /// Floating point addition is not necessarily associative, + /// might fail on an architecture not yet known with much higher precision + HPX_TEST_NEQ(r2, r2_shuffled); + HPX_TEST_NEQ(r3, r3_shuffled); +} + +template +void test_reduce1() +{ + using namespace hpx::execution; + + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); +} + +template +void test_reduce2() +{ + using namespace hpx::execution; + + test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); +} + +// template +// void test_reduce3() +// { +// using namespace hpx::execution; + +// test_orig_reduce_determinism(IteratorTag()); +// test_orig_reduce_determinism(IteratorTag()); +// } + +void reduce_test1() +{ + test_reduce1(); + test_reduce2(); + // test_reduce3(); + test_reduce1(); + test_reduce2(); +} + +/////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + unsigned int seed = (unsigned int) std::time(nullptr); + bool seed_random = false; + + if (vm.count("seed")) + { + seed = vm["seed"].as(); + seed_random = true; + } + + if (vm.count("seed-random")) + seed_random = vm["seed-random"].as(); + + if (seed_random) + { + std::cout << "using seed: " << seed << std::endl; + std::cout << "** std::accumulate, hpx::reduce may fail due to " + "non-determinism of the floating summation" + << std::endl; + gen.seed(seed); + std::srand(seed); + } + else + { + gen.seed(223); + std::srand(223); + } + + reduce_test1(); + + return hpx::local::finalize(); +} + +int main(int argc, char* argv[]) +{ + // add command line option which controls the random number generator seed + using namespace hpx::program_options; + options_description desc_commandline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + + desc_commandline.add_options()("seed,s", value(), + "the random number generator seed to use for this run"); + desc_commandline.add_options()("seed-random", value(), + "switch for the random number generator seed to use for this run"); + + // By default this test should run on all 
available cores + std::vector const cfg = {"hpx.os_threads=all"}; + + // Initialize and run HPX + hpx::local::init_params init_args; + init_args.desc_cmdline = desc_commandline; + init_args.cfg = cfg; + + HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0, + "HPX main exited with non-zero status"); + + return hpx::util::report_errors(); +} From 8b0ce4cd5e34598840e5f28851341de4d8d01213 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 17 Dec 2024 14:22:55 -0500 Subject: [PATCH 10/10] Address inspect tool, check module cmakelists, warnings and spell check - missing includes - prevent max/min being expanded as macros - minor spell check correction - remove pragma once in cpp file - resolve implicit type conversions in rfa type to single and double and other places - add dual license - remove unnecessary command for macos ci - use HPX_UNROLL instead of vanilla pragma - clang-17 cannot unroll so use checks - add typename qualifier for iterator type Signed-off-by: Shreyas Atre --- .github/workflows/macos_debug_fetch_hwloc.yml | 1 - libs/core/algorithms/CMakeLists.txt | 3 + .../detail/reduce_deterministic.hpp | 3 + .../hpx/parallel/algorithms/detail/rfa.hpp | 159 ++++++++++++------ .../unit/algorithms/reduce_deterministic.cpp | 26 +-- 5 files changed, 133 insertions(+), 59 deletions(-) diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index f778a6b117d..caec2af3ac2 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,7 +19,6 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ - autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt index 6fcfed897e2..9090345722d 100644 --- a/libs/core/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/CMakeLists.txt @@ -37,7 +37,9 @@ set(algorithms_headers hpx/parallel/algorithms/detail/parallel_stable_sort.hpp hpx/parallel/algorithms/detail/pivot.hpp hpx/parallel/algorithms/detail/reduce.hpp + hpx/parallel/algorithms/detail/reduce_deterministic.hpp hpx/parallel/algorithms/detail/replace.hpp + hpx/parallel/algorithms/detail/rfa.hpp hpx/parallel/algorithms/detail/rotate.hpp hpx/parallel/algorithms/detail/sample_sort.hpp hpx/parallel/algorithms/detail/search.hpp @@ -72,6 +74,7 @@ set(algorithms_headers hpx/parallel/algorithms/partition.hpp hpx/parallel/algorithms/reduce_by_key.hpp hpx/parallel/algorithms/reduce.hpp + hpx/parallel/algorithms/reduce_deterministic.hpp hpx/parallel/algorithms/remove_copy.hpp hpx/parallel/algorithms/remove.hpp hpx/parallel/algorithms/replace.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 87069858f49..b3773088917 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -32,6 +33,8 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) { + /// TODO: Put constraint on Reduce to be a binary plus operator + (void) r; hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); 
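+        // The bin table initialized above is copied into the global
+        // rfa::__rfa_bin_host_buffer__ below; presumably this buffer acts as
+        // the shared bin lookup table consulted by the reproducible
+        // accumulator defined in rfa.hpp.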
std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index 302f823fab7..fa9142cdf80 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -1,3 +1,34 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// --------------------------------------------------------------------------- +// This file has been taken from +// https://github.com/maddyscientist/reproducible_floating_sums commit +// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added +// to this file in accordance with the dual license terms for the Reproducible +// Floating-Point Summations and conformance with the HPX policy +// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md +// --------------------------------------------------------------------------- +// +/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. //Reproducible Floating Point Accumulations via Binned Floating Point //Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. //ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. @@ -26,6 +57,10 @@ #include #include #include +#include +#include + +#include namespace hpx::parallel::detail::rfa { template @@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa { static constexpr int FOLD = FOLD_; private: - std::array data = {0}; + std::array data = {{0}}; ///Floating-point precision bin width static constexpr auto BIN_WIDTH = @@ -351,21 +386,21 @@ namespace hpx::parallel::detail::rfa { ///Get index of float-point precision ///The index of a non-binned type is the smallest index a binned type would - ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///need to have to sum it reproducibly. Higher indices correspond to smaller ///bins. 
static inline constexpr int binned_dindex(const ftype x) { int exp = EXP(x); if (exp == 0) { - if (x == 0.0) + if (x == static_cast(0.0)) { return MAXINDEX; } else { std::frexp(x, &exp); - return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); } } return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; @@ -373,7 +408,7 @@ namespace hpx::parallel::detail::rfa { ///Get index of manually specified binned double precision ///The index of a binned type is the bin that it corresponds to. Higher - ///indicies correspond to smaller bins. + ///indices correspond to smaller bins. inline int binned_index() const { return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - @@ -416,7 +451,9 @@ namespace hpx::parallel::detail::rfa { int shift = binned_index() - X_index; if (shift > 0) { -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -425,7 +462,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = carry((i - shift) * inccarY); } const ftype* const bins = binned_bins(X_index); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int j = 0; j < FOLD; j++) { if (j >= shift) @@ -457,16 +496,19 @@ namespace hpx::parallel::detail::rfa { if (binned_index0()) { M = primary(0); - ftype qd = x * COMPRESSION; + ftype qd = x * static_cast(COMPRESSION); auto& ql = get_bits(qd); ql |= 1; qd += M; primary(0) = qd; M -= qd; - M *= EXPANSION * 0.5; + auto temp_m = (double) (((double) EXPANSION) * 0.5); + M *= static_cast(temp_m); x += M; x += M; -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 1; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -485,7 +527,9 @@ namespace hpx::parallel::detail::rfa { { ftype qd = x; auto& ql = get_bits(qd); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -550,7 +594,7 @@ namespace hpx::parallel::detail::rfa { int i = 0; if (ISNANINF(primary(0))) - return primary(0); + return (double) primary(0); if (ISZERO(primary(0))) return 0.0; @@ -564,29 +608,36 @@ namespace hpx::parallel::detail::rfa { { scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); - scaled = std::max( - std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + scaled = (std::max)( + (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); if (X_index == 0) { - Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); - Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + Y += ((double) carry(0)) * + ((((double) bins[0]) / 6.0) * scale_down * EXPANSION); + Y += ((double) carry(inccarX)) * + ((((double) bins[1]) / 6.0) * scale_down); + Y += ((double) primary(0) - (double) bins[0]) * scale_down * + EXPANSION; i = 2; } else { - Y += carry(0) * ((bins[0] / 6.0) * scale_down); + Y += ((double) carry(0)) * + (((double) bins[0] / 6.0) * scale_down); i = 1; } for (; i < scaled; i++) { - Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); - Y += - (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + Y += ((double) carry(i * inccarX)) * + (((double) bins[i] / 6.0) * scale_down); + Y += ((double) primary((i - 1) * incpriX) - + (double) (bins[i - 1])) * + scale_down; } if (i == FOLD) { - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + Y += ((double) primary((FOLD - 1) * incpriX) - + (double) 
(bins[FOLD - 1])) * scale_down; return Y * scale_up; } @@ -597,20 +648,23 @@ namespace hpx::parallel::detail::rfa { Y *= scale_up; for (; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += primary((i - 1) * incpriX) - bins[i - 1]; + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + Y += ((double) primary((FOLD - 1) * incpriX) - + ((double) bins[FOLD - 1])); } else { - Y += carry(0) * (bins[0] / 6.0); + Y += ((double) carry(0)) * ((double) bins[0] / 6.0); for (i = 1; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += (primary((i - 1) * incpriX) - bins[i - 1]); + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); } return Y; } @@ -627,7 +681,7 @@ namespace hpx::parallel::detail::rfa { if (ISNANINF(primary(0))) return primary(0); if (ISZERO(primary(0))) - return 0.0; + return 0.0f; //Note that the following order of summation is in order of decreasing //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and @@ -636,20 +690,22 @@ namespace hpx::parallel::detail::rfa { const auto* const bins = binned_bins(X_index); if (X_index == 0) { - Y += (double) carry(0) * (double) (bins[0] / 6.0) * + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) * (double) EXPANSION; - Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) carry(inccarX) * + (double) (((double) bins[1]) / 6.0); Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; i = 2; } else { - Y += (double) carry(0) * (double) (bins[0] / 6.0); + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0); i = 1; } for (; i < FOLD; i++) { - Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) carry(i * inccarX) * + (double) (((double) bins[i]) / 6.0); Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); @@ -694,8 +750,10 @@ namespace hpx::parallel::detail::rfa { if (shift > 0) { const auto* const bins = binned_bins(Y_index); - //shift Y upwards and add X to Y -#pragma unroll +//shift Y upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -705,7 +763,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = x.carry(i * inccarX) + carry((i - shift) * inccarY); } -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i == shift) @@ -717,8 +777,10 @@ namespace hpx::parallel::detail::rfa { else if (shift < 0) { const auto* const bins = binned_bins(X_index); - //shift X upwards and add X to Y -#pragma unroll +//shift X upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i < -shift) @@ -731,8 +793,10 @@ namespace hpx::parallel::detail::rfa { else if (shift == 0) { const auto* const bins = binned_bins(X_index); - // add X to Y -#pragma unroll +// add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; @@ -771,7 +835,7 @@ namespace hpx::parallel::detail::rfa { } ///Return the endurance of the binned fp - constexpr int endurance() const + constexpr 
size_t endurance() const { return ENDURANCE; } @@ -867,11 +931,11 @@ namespace hpx::parallel::detail::rfa { { if (std::is_same_v) { - return binned_conv_single(1, 1); + return static_cast(binned_conv_single(1, 1)); } else { - return binned_conv_double(1, 1); + return static_cast(binned_conv_double(1, 1)); } } @@ -888,7 +952,8 @@ namespace hpx::parallel::detail::rfa { { const double X = std::abs(max_abs_val); const double S = std::abs(binned_sum); - return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + return static_cast( + (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) * std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + ((7.0 * EPSILON) / (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - @@ -973,7 +1038,7 @@ namespace hpx::parallel::detail::rfa { T max_abs_val = input[0]; for (size_t i = 0; i < N; i++) { - max_abs_val = max(max_abs_val, std::abs(input[i])); + max_abs_val = (std::max)(max_abs_val, std::abs(input[i])); } add(input, N, max_abs_val); } @@ -1142,4 +1207,4 @@ namespace hpx::parallel::detail::rfa { } }; -} // namespace hpx::parallel::detail::rfa \ No newline at end of file +} // namespace hpx::parallel::detail::rfa diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 92dd2e7f3dc..5a06c509efd 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -4,8 +4,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#pragma once - #include #include #include @@ -19,6 +17,7 @@ #include #include #include +#include #include #include "test_utils.hpp" @@ -27,11 +26,12 @@ int seed = std::random_device{}(); std::mt19937 gen(seed); template -T get_rand( - T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) { return LO + - static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); } /////////////////////////////////////////////////////////////////////////////// @@ -42,10 +42,12 @@ void test_reduce1(IteratorTag) { // check if different type for deterministic and nondeeterministic // and same result i.e. correct computation - using base_iterator_det = std::vector::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; std::vector deterministic(LEN); @@ -75,8 +77,8 @@ void test_reduce1(IteratorTag) FloatTypeNonDeterministic r3 = std::accumulate( nondeterministic.begin(), nondeterministic.end(), val_non_det); - HPX_TEST_EQ(r1, r3); - HPX_TEST_EQ(r2, r3); + HPX_TEST_EQ(static_cast(r1), r3); + HPX_TEST_EQ(static_cast(r2), r3); } template ::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; constexpr FloatTypeDeterministic num_bounds_det = @@ -129,7 +132,8 @@ template void test_orig_reduce_determinism(IteratorTag) { - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; constexpr auto num_bounds_ndet =