From 7f908fd780942fabe8dd21bc22b901ece5013c29 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:30:59 -0600 Subject: [PATCH 1/9] [macOS] Comparison between exactly same types Signed-off-by: Shreyas Atre --- libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index b29652a3ede1..d86fc5775415 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp +++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1); + HPX_TEST_EQ(i.get_tag(), 1UL); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0); + HPX_TEST_EQ(j.get_next_tag(), 0UL); } { From ed7e16101662114a69abb0fcaa23e6ddfb7b93eb Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:32:20 -0600 Subject: [PATCH 2/9] [macOS] Apple does not seem to have any typedef for unsigned long int Signed-off-by: Shreyas Atre --- libs/core/debugging/src/print.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 3d7cf5da2aa0..8a01d9574853 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,6 +57,10 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); +#ifdef __APPLE__ + template HPX_CORE_EXPORT void print_dec( + std::ostream&, unsigned long const&, int); +#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int); From b75ea85559702dde98e6d4dc5ccfcc0d2a08fb2c Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 4 Nov 2024 16:14:13 -0600 Subject: [PATCH 3/9] Start RFA impl Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 56 ++ .../hpx/parallel/algorithms/detail/rfa.hpp | 851 ++++++++++++++++++ .../algorithms/reduce_deterministic.hpp | 569 ++++++++++++ .../tests/unit/algorithms/CMakeLists.txt | 1 + .../unit/algorithms/reduce_deterministic.cpp | 98 ++ 5 files changed, 1575 insertions(+) create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp new file mode 100644 index 000000000000..79f78b8a858a --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -0,0 +1,56 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "rfa.hpp" + +namespace hpx::parallel::detail { + + template + struct sequential_reduce_deterministic_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_t> + { + private: + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, + InIterE last, T init, Reduce&& r) + { + RFA::ReproducibleFloatingAccumulator rfa; + rfa += init; + rfa.add(first, last); + return rfa.conv(); + } + }; + +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_t + sequential_reduce_deterministic = + sequential_reduce_deterministic_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic( + Args&&... args) + { + return sequential_reduce_deterministic_t{}( + std::forward(args)...); + } +#endif + +} // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp new file mode 100644 index 000000000000..e7413add8e7e --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -0,0 +1,851 @@ +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include + +namespace RFA { + template +inline constexpr Real ldexp_impl(Real arg, int exp) noexcept +{ + while(exp > 0) + { + arg *= 2; + --exp; + } + while(exp < 0) + { + arg /= 2; + ++exp; + } + + return arg; +} + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double` + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>::type* = + nullptr> + class ReproducibleFloatingAccumulator + { + private: + std::array data = {0}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same::value ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. + static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Generates binned floating-point reference bins + static constexpr std::array initialize_bins() + { + std::array bins{0}; + + if (std::is_same::value) + { + bins[0] = ldexp_impl(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * ldexp_impl(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = ldexp_impl(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + + return bins; + } + + ///The binned floating-point reference bins + static constexpr auto bins = initialize_bins(); + + ///Return a binned floating-point reference bin + static inline constexpr const ftype* binned_bins(const int x) + { + return &bins[x]; + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return a pointer to the primary vector + inline ftype* pvec() + { + return &data[0]; + } + ///Return a pointer to the carry vector + inline ftype* cvec() + { + return &data[FOLD]; + } + ///Return a const pointer to the primary vector + inline const ftype* pvec() const + { + return &data[0]; + } + ///Return a const pointer to the carry vector + inline const ftype* cvec() const + { + return &data[FOLD]; + } + + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == 0.0) + { + return MAXINDEX; + } + else + { + frexp(x, &exp); + return std::min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indicies correspond to smaller bins. + inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(pvec()[0])) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(pvec()[0]) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + int i; + int j; + int X_index; + int shift; + auto* const priY = pvec(); + auto* const carY = cvec(); + + if (ISNANINF(priY[0])) + { + return; + } + + X_index = binned_dindex(max_abs_val); + if (priY[0] == 0.0) + { + const ftype* const bins = binned_bins(X_index); + for (i = 0; i < FOLD; i++) + { + priY[i * incpriY] = bins[i]; + carY[i * inccarY] = 0.0; + } + } + else + { + shift = binned_index() - X_index; + if (shift > 0) + { + for (i = FOLD - 1; i >= shift; i--) + { + priY[i * incpriY] = priY[(i - shift) * incpriY]; + carY[i * inccarY] = carY[(i - shift) * inccarY]; + } + const ftype* const bins = binned_bins(X_index); + for (j = 0; j < i + 1; j++) + { + priY[j * incpriY] = bins[j]; + carY[j * inccarY] = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + ftype M; + int i; + ftype x = X; + auto* const priY = pvec(); + + if (ISNANINF(x) || ISNANINF(priY[0])) + { + priY[0] += x; + return; + } + + if (binned_index0()) + { + M = priY[0]; + ftype qd = x * COMPRESSION; + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + priY[0] = qd; + M -= qd; + M *= EXPANSION * 0.5; + x += M; + x += M; + for (i = 1; i < FOLD - 1; i++) + { + M = priY[i * incpriY]; + qd = x; + ql |= 1; + qd += M; + priY[i * incpriY] = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + priY[i * incpriY] += qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); + for (i = 0; i < FOLD - 1; i++) + { + M = priY[i * incpriY]; + qd = x; + ql |= 1; + qd += M; + priY[i * incpriY] = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + priY[i * incpriY] += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + void binned_dmrenorm(const int incpriX, const int inccarX) + { + auto* priX = pvec(); + auto* carX = cvec(); + + if (priX[0] == 0.0 || ISNANINF(priX[0])) + { + return; + } + + for (int i = 0; i < FOLD; i++, priX += incpriX, carX += inccarX) + { + auto tmp_renormd = priX[0]; + auto& tmp_renorml = get_bits(tmp_renormd); + + carX[0] += (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + priX[0] = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + const auto* const priX = pvec(); + const auto* const carX = cvec(); + + if (ISNANINF(priX[0])) + { + return priX[0]; + } + + if (priX[0] == 0.0) + { + return 0.0; + } + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = ldexp_impl(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = ldexp_impl(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = std::max( + std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += carX[0] * ((bins[0] / 6.0) * scale_down * EXPANSION); + Y += carX[inccarX] * ((bins[1] / 6.0) * scale_down); + Y += (priX[0] - bins[0]) * scale_down * EXPANSION; + i = 2; + } + else + { + Y += carX[0] * ((bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += carX[i * inccarX] * ((bins[i] / 6.0) * scale_down); + Y += (priX[(i - 1) * incpriX] - bins[i - 1]) * scale_down; + } + if (i == FOLD) + { + Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += carX[i * inccarX] * (bins[i] / 6.0); + Y += priX[(i - 1) * incpriX] - bins[i - 1]; + } + Y += priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]; + } + else + { + Y += carX[0] * (bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += carX[i * inccarX] * (bins[i] / 6.0); + Y += (priX[(i - 1) * incpriX] - bins[i - 1]); + } + Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]); + } + return Y; + } + + ///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + const auto* const priX = pvec(); + const auto* const carX = cvec(); + + if (ISNANINF(priX[0])) + { + return priX[0]; + } + + if (priX[0] == 0.0) + { + return 0.0; + } + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carX[0] * (double) (bins[0] / 6.0) * + (double) EXPANSION; + Y += (double) carX[inccarX] * (double) (bins[1] / 6.0); + Y += (double) (priX[0] - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carX[0] * (double) (bins[0] / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carX[i * inccarX] * (double) (bins[i] / 6.0); + Y += (double) (priX[(i - 1) * incpriX] - bins[i - 1]); + } + Y += (double) (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& other, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + auto* const priX = pvec(); + auto* const carX = cvec(); + auto* const priY = other.pvec(); + auto* const carY = other.cvec(); + + if (priX[0] == 0.0) + return; + + if (priY[0] == 0.0) + { + for (int i = 0; i < FOLD; i++) + { + priY[i * incpriY] = priX[i * incpriX]; + carY[i * inccarY] = carX[i * inccarX]; + } + return; + } + + if (ISNANINF(priX[0]) || ISNANINF(priY[0])) + { + priY[0] += priX[0]; + return; + } + + const auto X_index = binned_index(priX); + const auto Y_index = binned_index(priY); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); + //shift Y upwards and add X to Y + for (int i = FOLD - 1; i >= shift; i--) + { + priY[i * incpriY] = priX[i * incpriX] + + (priY[(i - shift) * incpriY] - bins[i - shift]); + carY[i * inccarY] = + carX[i * inccarX] + carY[(i - shift) * inccarY]; + } + for (int i = 0; i < shift && i < FOLD; i++) + { + priY[i * incpriY] = priX[i * incpriX]; + carY[i * inccarY] = carX[i * inccarX]; + } + } + else + { + const auto* const bins = binned_bins(X_index); + //shift X upwards and add X to Y + for (int i = 0 - shift; i < FOLD; i++) + { + priY[i * incpriY] += + priX[(i + shift) * incpriX] - bins[i + shift]; + carY[i * inccarY] += carX[(i + shift) * inccarX]; + } + } + + binned_dmrenorm(incpriY, inccarY); + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + int fold() const + { + return FOLD; + } + + ///Returns the number of reference bins. Used for judging memory usage. + static constexpr size_t number_of_reference_bins() + { + return bins.size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template ::value>::type* = + nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template ::value>::type* = + nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template ::value>::type* = + nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator& o) + { + data = o.data; + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (pvec()[0] != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.pvec()[i * incpriX] = + bins[i] - (pvec()[i * incpriX] - bins[i]); + temp.cvec()[i * inccarX] = -cvec()[i * inccarX]; + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same::value) + { + return binned_conv_single(1, 1); + } + else + { + return binned_conv_double(1, 1); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast(std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) * + ldexp_impl(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + for (; first != last; first++, count++) + { + binned_dmddeposit(static_cast(*first), 1); + if (count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template ::value>::type* = + nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + { + return; + } + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template ::value>::type* = + nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + { + return; + } + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = std::max(max_abs_val, std::abs(input[i])); + } + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. See the source of + ///`add()` for an example + template ::value>::type* = + nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; +} // namespace RFA diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp new file mode 100644 index 000000000000..56c8d3349898 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -0,0 +1,569 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/// \file parallel/algorithms/reduce_deterministic.hpp +/// \page hpx::reduce_deterministic +/// \headerfile hpx/algorithm.hpp + +#pragma once + +#if defined(DOXYGEN) + +namespace hpx { + + // clang-format off + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a FwdIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + hpx::parallel::util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns T otherwise (where T is the value_type of + /// \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename hpx::parallel::util::detail::algorithm_result::value_type + >::type + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last); + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a InIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// \returns The \a reduce algorithm returns \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// \returns The \a reduce algorithm returns a \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// \returns The \a reduce algorithm returns \a T (where T is the + /// value_type of \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename std::iterator_traits::value_type + reduce_deterministic(FwdIter first, FwdIter last); + // clang-format on +} // namespace hpx + +#else // DOXYGEN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace hpx::parallel { + + /////////////////////////////////////////////////////////////////////////// + // reduce + namespace detail { + + /// \cond NOINTERNAL + template + struct reduce_deterministic + : public algorithm, T> + { + constexpr reduce_deterministic() noexcept + : algorithm("reduce_deterministic") + { + } + + template + static constexpr T sequential(ExPolicy&& policy, InIterB first, + InIterE last, T_&& init, Reduce&& r) + { + // hpx::parallel::detail::sequential_reduce_deterministic_t seq; + // return seq(policy,first,last,0.0f,r); + return hpx::parallel::detail::sequential_reduce_deterministic( + HPX_FORWARD(ExPolicy, policy), first, last, + HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); + } + + // template + // static util::detail::algorithm_result_t parallel( + // ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init, + // Reduce&& r) + // { + // if (first == last) + // { + // return util::detail::algorithm_result::get( + // HPX_FORWARD(T_, init)); + // } + + // auto f1 = [r](FwdIterB part_begin, std::size_t part_size) -> T { + // T val = *part_begin; + // return detail::sequential_reduce( + // ++part_begin, --part_size, HPX_MOVE(val), r); + // }; + + // return util::partitioner::call( + // HPX_FORWARD(ExPolicy, policy), first, + // detail::distance(first, last), HPX_MOVE(f1), + // hpx::unwrapping( + // [init = HPX_FORWARD(T_, init), + // r = HPX_FORWARD(Reduce, r)](auto&& results) -> T { + // return detail::sequential_reduce( + // hpx::util::begin(results), + // hpx::util::size(results), init, r); + // })); + // } + }; + /// \endcond + } // namespace detail +} // namespace hpx::parallel + +namespace hpx { + + /////////////////////////////////////////////////////////////////////////// + // CPO for hpx::reduce + inline constexpr struct reduce_deterministic_t final + : hpx::detail::tag_parallel_algorithm + { + private: + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init, F f) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t::value_type> + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{}, + std::plus<>{}); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template + )> + // clang-format on + friend typename std::iterator_traits::value_type + tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(hpx::execution::seq, first, last, value_type{}, + std::plus<>()); + } + } reduce_deterministic{}; +} // namespace hpx + +#endif // DOXYGEN diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 7d3a9204530c..559ee830030e 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -94,6 +94,7 @@ set(tests partition_copy reduce_ reduce_by_key + reduce_deterministic remove remove1 remove2 diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp new file mode 100644 index 000000000000..e2229a992fe4 --- /dev/null +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -0,0 +1,98 @@ +// Copyright (c) 2014-2020 Hartmut Kaiser +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils.hpp" + +int seed = std::random_device{}(); +std::mt19937 gen(seed); + +/////////////////////////////////////////////////////////////////////////////// +template +void test_reduce1(IteratorTag) +{ + using base_iterator = std::vector::iterator; + using iterator = test::test_iterator; + + std::vector c(10007); + std::iota(std::begin(c), std::end(c), gen()); + + float val(42); + auto op = [](auto v1, auto v2) { return v1 + v2; }; + + int r1 = + hpx::reduce_deterministic(iterator(std::begin(c)), iterator(std::end(c)), val, op); + + // verify values + int r2 = std::accumulate(std::begin(c), std::end(c), val, op); + HPX_TEST_EQ(r1, r2); +} + +template +void test_reduce1() +{ + using namespace hpx::execution; + + test_reduce1(IteratorTag()); +} + +void reduce_test1() +{ + test_reduce1(); + test_reduce1(); +} + + +/////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + unsigned int seed = (unsigned int) std::time(nullptr); + if (vm.count("seed")) + seed = vm["seed"].as(); + + std::cout << "using seed: " << seed << std::endl; + gen.seed(seed); + + reduce_test1(); + + return hpx::local::finalize(); +} + +int main(int argc, char* argv[]) +{ + // add command line option which controls the random number generator seed + using namespace hpx::program_options; + options_description desc_commandline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + + desc_commandline.add_options()("seed,s", value(), + "the random number generator seed to use for this run"); + // By default this test should run on all available cores + std::vector const cfg = {"hpx.os_threads=all"}; + + // Initialize and run HPX + hpx::local::init_params init_args; + init_args.desc_cmdline = desc_commandline; + init_args.cfg = cfg; + + HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0, + "HPX main exited with non-zero status"); + + return hpx::util::report_errors(); +} From 7e070e9a80c14b6d6fcd291de5ee9379039773de Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Wed, 4 Dec 2024 00:51:55 -0600 Subject: [PATCH 4/9] Fix some issues and add tests for determinism - Add Kate's CUDA impl for RFA TODO: - Use original RFA instead of Kate's - Make a parallel version out of it - Make a partition vector suitable version Signed-off-by: Shreyas Atre --- cmake/HPX_AddConfigTest.cmake | 2 +- .../detail/reduce_deterministic.hpp | 22 +- .../hpx/parallel/algorithms/detail/rfa.hpp | 63 +- .../parallel/algorithms/detail/rfa_cuda.hpp | 1168 +++++++++++++++++ .../tests/unit/algorithms/CMakeLists.txt | 4 + .../unit/algorithms/reduce_deterministic.cpp | 208 ++- 6 files changed, 1420 insertions(+), 47 deletions(-) create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp diff --git a/cmake/HPX_AddConfigTest.cmake b/cmake/HPX_AddConfigTest.cmake index 92b24ceec194..b0e1ea437452 100644 --- a/cmake/HPX_AddConfigTest.cmake +++ b/cmake/HPX_AddConfigTest.cmake @@ -408,7 +408,7 @@ function(hpx_check_for_cxx11_std_quick_exit) add_hpx_config_test( HPX_WITH_CXX11_STD_QUICK_EXIT SOURCE cmake/tests/cxx11_std_quick_exit.cpp - FILE ${ARGN} + FILE EXECUTE ${ARGN} ) endfunction() diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 79f78b8a858a..acf0cc687f56 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -9,10 +9,11 @@ #include #include #include -#include +#include #include #include +#include #include #include #include "rfa.hpp" @@ -31,9 +32,22 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) { - RFA::ReproducibleFloatingAccumulator rfa; - rfa += init; - rfa.add(first, last); + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + // rfa.add(first, last); + for (auto e = first; e != last; ++e) + { + // printf("%f \n",*e); + rfa.set_max_abs_val(*e); + rfa.unsafe_add(*e); + rfa.renorm(); + } return rfa.conv(); } }; diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index e7413add8e7e..340f4cc7b2ef 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -29,21 +29,26 @@ namespace RFA { template -inline constexpr Real ldexp_impl(Real arg, int exp) noexcept -{ - while(exp > 0) + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept { - arg *= 2; - --exp; + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; } - while(exp < 0) - { - arg /= 2; - ++exp; - } - - return arg; -} ///Class to hold a reproducible summation of the numbers passed to it /// @@ -54,7 +59,7 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept nullptr> class ReproducibleFloatingAccumulator { - private: + public: std::array data = {0}; ///Floating-point precision bin width @@ -86,7 +91,8 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept ///The number of deposits that can be performed before a renorm is necessary. ///Applies also to binned complex double precision. static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); - + // static_assert(ENDURANCE == 0); + public: ///Generates binned floating-point reference bins static constexpr std::array initialize_bins() { @@ -115,8 +121,9 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept } ///The binned floating-point reference bins - static constexpr auto bins = initialize_bins(); + static std::array bins; + private: ///Return a binned floating-point reference bin static inline constexpr const ftype* binned_bins(const int x) { @@ -519,10 +526,10 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept const int incpriX, const int inccarX, const int incpriY, const int inccarY) { - auto* const priX = pvec(); - auto* const carX = cvec(); - auto* const priY = other.pvec(); - auto* const carY = other.cvec(); + auto* const priX = other.pvec(); + auto* const carX = other.cvec(); + auto* const priY = pvec(); + auto* const carY = cvec(); if (priX[0] == 0.0) return; @@ -543,8 +550,9 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept return; } - const auto X_index = binned_index(priX); - const auto Y_index = binned_index(priY); + // const auto X_index = binned_index(priX); + const auto X_index = other.binned_index(); + const auto Y_index = binned_index(); const auto shift = Y_index - X_index; if (shift > 0) { @@ -721,7 +729,8 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept { const double X = std::abs(max_abs_val); const double S = std::abs(binned_sum); - return static_cast(std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) * + return static_cast( + std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) * ldexp_impl(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + ((7.0 * EPSILON) / (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - @@ -848,4 +857,12 @@ inline constexpr Real ldexp_impl(Real arg, int exp) noexcept binned_dmrenorm(1, 1); } }; + + template <> + auto ReproducibleFloatingAccumulator::bins = + ReproducibleFloatingAccumulator::initialize_bins(); + + template <> + auto ReproducibleFloatingAccumulator::bins = + ReproducibleFloatingAccumulator::initialize_bins(); } // namespace RFA diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp new file mode 100644 index 000000000000..05f71d9ae746 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp @@ -0,0 +1,1168 @@ +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include + +#ifndef __CUDACC__ +#define __host__ +#define __device__ +#define __forceinline__ +#include +using std::array; +using std::max; +using std::min; +#else +#include +using cuda::std::array; +using cuda::std::max; +using cuda::std::min; +#include "vector.hpp" +#endif + +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + + template + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept + { + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; + } + + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char bin_host_buffer[sizeof(RFA_bins)]; +#ifdef __CUDACC__ + __constant__ static char bin_device_buffer[sizeof(RFA_bins)]; +#endif + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>* = + nullptr> + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator + { + public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: + array data = {0}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. + static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Return a binned floating-point reference bin + inline const ftype* binned_bins(const int x) const + { +#ifdef __CUDA_ARCH__ // must be arch not CC here + return &reinterpret_cast&>(bin_device_buffer)[x]; +#else + return &reinterpret_cast&>(bin_host_buffer)[x]; +#endif + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } + } + + ///Return primary vector value ref + inline ftype& primary(int i) + { + const auto& c = *this; + return const_cast(c.primary(i)); + } + + ///Return carry vector value ref + inline ftype& carry(int i) + { + const auto& c = *this; + return const_cast(c.carry(i)); + } + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) + { + return false; + } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif + +#ifdef DISABLE_NANINF + static inline constexpr int ISNANINF(const ftype) + { + return false; + } +#else + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } +#endif + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == 0.0) + { + return MAXINDEX; + } + else + { + frexp(x, &exp); + return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indicies correspond to smaller bins. + inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(primary(0))) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + if (ISNANINF(primary(0))) + return; + + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) + { + const ftype* const bins = binned_bins(X_index); + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; + } + } + else + { + int shift = binned_index() - X_index; + if (shift > 0) + { +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); + } + const ftype* const bins = binned_bins(X_index); +#pragma unroll + for (int j = 0; j < FOLD; j++) + { + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + ftype M; + ftype x = X; + + if (ISNANINF(x) || ISNANINF(primary(0))) + { + primary(0) += x; + return; + } + + if (binned_index0()) + { + M = primary(0); + ftype qd = x * COMPRESSION; + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + primary(0) = qd; + M -= qd; + M *= EXPANSION * 0.5; + x += M; + x += M; +#pragma unroll + for (int i = 1; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); +#pragma unroll + for (int i = 0; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + inline void binned_dmrenorm(const int incpriX, const int inccarX) + { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) + return; + + for (int i = 0; i < FOLD; i++) + { + auto tmp_renormd = primary(i * incpriX); + auto& tmp_renorml = get_bits(tmp_renormd); + + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + primary(i * incpriX) = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = + max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); + Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); + Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + i = 2; + } + else + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); + Y += + (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + } + if (i == FOLD) + { + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += primary((i - 1) * incpriX) - bins[i - 1]; + } + Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + } + else + { + Y += carry(0) * (bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + } + return Y; + } + + ///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carry(0) * (double) (bins[0] / 6.0) * + (double) EXPANSION; + Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carry(0) * (double) (bins[0] / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + if (ISZERO(x.primary(0))) + return; + + if (ISZERO(primary(0))) + { + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + return; + } + + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) + { + primary(0) += x.primary(0); + return; + } + + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); + //shift Y upwards and add X to Y +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); + } +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + } + else if (shift < 0) + { + const auto* const bins = binned_bins(X_index); + //shift X upwards and add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); + // add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); + } + } + + binned_dmrenorm(incpriY, inccarY); + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + constexpr int fold() const + { + return FOLD; + } + + ///Return the endurance of the binned fp + constexpr int endurance() const + { + return ENDURANCE; + } + + ///Returns the number of reference bins. Used for judging memory usage. + constexpr size_t number_of_reference_bins() + { + return array::size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (primary(0) != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same_v) + { + return binned_conv_single(1, 1); + } + else + { + return binned_conv_double(1, 1); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast(max(X, ldexp(0.5, MIN_EXP - 1)) * + ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + size_t N = last - first; + for (; first != last; first++, count++) + { + binned_dmddeposit(static_cast(*first), 1); + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template >* = nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + return; + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template >* = nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = max(max_abs_val, std::abs(input[i])); + } + add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. See the source of + ///`add()` for an example + template >* = nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; + + +} // namespace hpx::parallel::detail::rfa \ No newline at end of file diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 559ee830030e..76dc5fcd9806 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -246,3 +246,7 @@ foreach(test ${tests}) "modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS} ) endforeach() + +target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address) + +target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address) \ No newline at end of file diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index e2229a992fe4..462b1473f332 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2014-2020 Hartmut Kaiser +// Copyright (c) 2024 Shreyas Atre // // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying @@ -6,16 +6,19 @@ #pragma once +#include #include +#include +#include #include -#include +#include #include #include #include +#include #include #include -#include #include #include "test_utils.hpp" @@ -23,25 +26,147 @@ int seed = std::random_device{}(); std::mt19937 gen(seed); +template +T get_rand( + T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +{ + return LO + + static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); +} + /////////////////////////////////////////////////////////////////////////////// -template + +template void test_reduce1(IteratorTag) { - using base_iterator = std::vector::iterator; - using iterator = test::test_iterator; + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; - std::vector c(10007); - std::iota(std::begin(c), std::end(c), gen()); + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; - float val(42); - auto op = [](auto v1, auto v2) { return v1 + v2; }; + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); - int r1 = - hpx::reduce_deterministic(iterator(std::begin(c)), iterator(std::end(c)), val, op); + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); // verify values - int r2 = std::accumulate(std::begin(c), std::end(c), val, op); - HPX_TEST_EQ(r1, r2); + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(r1, r3); + HPX_TEST_EQ(r2, r3); +} + +template +void test_reduce_determinism(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + constexpr auto num_bounds_det = + std::is_same_v ? 1000.0f : 1000000.0f; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = + get_rand(-num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic( + iterator_det(std::begin(deterministic_shuffled)), + iterator_det(std::end(deterministic_shuffled)), val_det, op); + + HPX_TEST_EQ(r1, + r1_shuffled); // Deterministically calculated, should always satisfy +} + +template +void test_orig_reduce_determinism(IteratorTag) +{ + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + constexpr auto num_bounds_ndet = + std::is_same_v ? 1000.0f : 1000000.0f; + + std::vector nondeterministic(LEN); + for (size_t i = 0; i < LEN; ++i) + { + nondeterministic[i] = get_rand( + -num_bounds_ndet, num_bounds_ndet); + } + std::vector nondeterministic_shuffled = + nondeterministic; + std::shuffle(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), gen); + + FloatTypeNonDeterministic val_non_det(41.999); + + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic_shuffled)), + iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + FloatTypeNonDeterministic r3_shuffled = + std::accumulate(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), val_non_det); + + /// failed around 131 times out of 1000 on macOS arm + /// Floating point addition is not necessarily associative, + /// might fail on an architecture not yet known with much higher precision + HPX_TEST_NEQ(r2, r2_shuffled); + HPX_TEST_NEQ(r3, r3_shuffled); } template @@ -49,25 +174,67 @@ void test_reduce1() { using namespace hpx::execution; - test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); +} + +template +void test_reduce2() +{ + using namespace hpx::execution; + + test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); +} + +template +void test_reduce3() +{ + using namespace hpx::execution; + + test_orig_reduce_determinism(IteratorTag()); + test_orig_reduce_determinism(IteratorTag()); } void reduce_test1() { test_reduce1(); - test_reduce1(); + test_reduce2(); + test_reduce3(); + // test_reduce1(); } - /////////////////////////////////////////////////////////////////////////////// int hpx_main(hpx::program_options::variables_map& vm) { unsigned int seed = (unsigned int) std::time(nullptr); + bool seed_random = false; + if (vm.count("seed")) + { seed = vm["seed"].as(); + seed_random = true; + } + + if (vm.count("seed-random")) + seed_random = vm["seed-random"].as(); - std::cout << "using seed: " << seed << std::endl; - gen.seed(seed); + if (seed_random) + { + std::cout << "using seed: " << seed << std::endl; + std::cout << "** std::accumulate, hpx::reduce may fail due to " + "non-determinism of the floating summation" + << std::endl; + gen.seed(seed); + std::srand(seed); + } + else + { + gen.seed(223); + std::srand(223); + } reduce_test1(); @@ -83,6 +250,9 @@ int main(int argc, char* argv[]) desc_commandline.add_options()("seed,s", value(), "the random number generator seed to use for this run"); + desc_commandline.add_options()("seed-random", value(), + "switch for the random number generator seed to use for this run"); + // By default this test should run on all available cores std::vector const cfg = {"hpx.os_threads=all"}; From 7f9a3c898bf81a6cc46e7e9a36dc58ac8de2f607 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sat, 7 Dec 2024 00:44:59 -0600 Subject: [PATCH 5/9] Remove unnecessary things from rfa - Also perform renorm and update only when necessary Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 22 +- .../hpx/parallel/algorithms/detail/rfa.hpp | 714 ++++++---- .../parallel/algorithms/detail/rfa_cuda.hpp | 1168 ----------------- .../unit/algorithms/reduce_deterministic.cpp | 2 +- 4 files changed, 512 insertions(+), 1394 deletions(-) delete mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index acf0cc687f56..76f093837def 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -34,19 +34,29 @@ namespace hpx::parallel::detail { { hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); - memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; rfa.set_max_abs_val(init); rfa.unsafe_add(init); rfa.renorm(); - // rfa.add(first, last); + size_t count = 0; + T max_val = std::abs(*first); for (auto e = first; e != last; ++e) { - // printf("%f \n",*e); - rfa.set_max_abs_val(*e); + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } rfa.unsafe_add(*e); - rfa.renorm(); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } } return rfa.conv(); } diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index 340f4cc7b2ef..4a9910cb6faa 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -27,7 +27,73 @@ #include #include -namespace RFA { +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + template inline constexpr Real ldexp_impl(Real arg, int exp) noexcept { @@ -50,21 +116,74 @@ namespace RFA { // return arg; } + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + std::array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = std::ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char bin_host_buffer[sizeof(RFA_bins)]; + ///Class to hold a reproducible summation of the numbers passed to it /// - ///@param ftype Floating-point data type; either `float` or `double` + ///@param ftype Floating-point data type; either `float` or `double ///@param FOLD The fold; use 3 as a default unless you understand it. - template ::value>::type* = + template ::value>* = nullptr> - class ReproducibleFloatingAccumulator + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator { public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: std::array data = {0}; ///Floating-point precision bin width static constexpr auto BIN_WIDTH = - std::is_same::value ? 40 : 13; + std::is_same_v ? 40 : 13; static constexpr auto MIN_EXP = std::numeric_limits::min_exponent; static constexpr auto MAX_EXP = @@ -91,43 +210,11 @@ namespace RFA { ///The number of deposits that can be performed before a renorm is necessary. ///Applies also to binned complex double precision. static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); - // static_assert(ENDURANCE == 0); - public: - ///Generates binned floating-point reference bins - static constexpr std::array initialize_bins() - { - std::array bins{0}; - if (std::is_same::value) - { - bins[0] = ldexp_impl(0.75, MAX_EXP); - } - else - { - bins[0] = 2.0 * ldexp_impl(0.75, MAX_EXP - 1); - } - - for (int index = 1; index <= MAXINDEX; index++) - { - bins[index] = ldexp_impl(0.75, - MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); - } - for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) - { - bins[index] = bins[index - 1]; - } - - return bins; - } - - ///The binned floating-point reference bins - static std::array bins; - - private: ///Return a binned floating-point reference bin - static inline constexpr const ftype* binned_bins(const int x) + inline const ftype* binned_bins(const int x) const { - return &bins[x]; + return &reinterpret_cast&>(bin_host_buffer)[x]; } ///Get the bit representation of a float @@ -151,33 +238,109 @@ namespace RFA { return *reinterpret_cast(&x); } - ///Return a pointer to the primary vector - inline ftype* pvec() + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const { - return &data[0]; + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } } - ///Return a pointer to the carry vector - inline ftype* cvec() + + ///Return primary vector value ref + inline ftype& primary(int i) { - return &data[FOLD]; + const auto& c = *this; + return const_cast(c.primary(i)); } - ///Return a const pointer to the primary vector - inline const ftype* pvec() const + + ///Return carry vector value ref + inline ftype& carry(int i) { - return &data[0]; + const auto& c = *this; + return const_cast(c.carry(i)); } - ///Return a const pointer to the carry vector - inline const ftype* cvec() const + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) { - return &data[FOLD]; + return false; } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif +#ifdef DISABLE_NANINF + static inline constexpr int ISNANINF(const ftype) + { + return false; + } +#else static inline constexpr int ISNANINF(const ftype x) { const auto bits = get_bits(x); return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); } +#endif static inline constexpr int EXP(const ftype x) { @@ -200,8 +363,8 @@ namespace RFA { } else { - frexp(x, &exp); - return std::min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + std::frexp(x, &exp); + return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); } } return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; @@ -213,7 +376,7 @@ namespace RFA { inline int binned_index() const { return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - - EXP(pvec()[0])) / + EXP(primary(0))) / BIN_WIDTH; } @@ -221,7 +384,7 @@ namespace RFA { ///A quick check to determine if the index is 0 inline bool binned_index0() const { - return EXP(pvec()[0]) == MAX_EXP + EXP_BIAS; + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; } ///Update manually specified binned fp with a scalar (X -> Y) @@ -234,43 +397,40 @@ namespace RFA { void binned_dmdupdate( const ftype max_abs_val, const int incpriY, const int inccarY) { - int i; - int j; - int X_index; - int shift; - auto* const priY = pvec(); - auto* const carY = cvec(); - - if (ISNANINF(priY[0])) - { + if (ISNANINF(primary(0))) return; - } - X_index = binned_dindex(max_abs_val); - if (priY[0] == 0.0) + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) { const ftype* const bins = binned_bins(X_index); - for (i = 0; i < FOLD; i++) + for (int i = 0; i < FOLD; i++) { - priY[i * incpriY] = bins[i]; - carY[i * inccarY] = 0.0; + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; } } else { - shift = binned_index() - X_index; + int shift = binned_index() - X_index; if (shift > 0) { - for (i = FOLD - 1; i >= shift; i--) +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) { - priY[i * incpriY] = priY[(i - shift) * incpriY]; - carY[i * inccarY] = carY[(i - shift) * inccarY]; + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); } const ftype* const bins = binned_bins(X_index); - for (j = 0; j < i + 1; j++) +#pragma unroll + for (int j = 0; j < FOLD; j++) { - priY[j * incpriY] = bins[j]; - carY[j * inccarY] = 0.0; + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; } } } @@ -285,59 +445,59 @@ namespace RFA { void binned_dmddeposit(const ftype X, const int incpriY) { ftype M; - int i; ftype x = X; - auto* const priY = pvec(); - if (ISNANINF(x) || ISNANINF(priY[0])) + if (ISNANINF(x) || ISNANINF(primary(0))) { - priY[0] += x; + primary(0) += x; return; } if (binned_index0()) { - M = priY[0]; + M = primary(0); ftype qd = x * COMPRESSION; auto& ql = get_bits(qd); ql |= 1; qd += M; - priY[0] = qd; + primary(0) = qd; M -= qd; M *= EXPANSION * 0.5; x += M; x += M; - for (i = 1; i < FOLD - 1; i++) +#pragma unroll + for (int i = 1; i < FOLD - 1; i++) { - M = priY[i * incpriY]; + M = primary(i * incpriY); qd = x; ql |= 1; qd += M; - priY[i * incpriY] = qd; + primary(i * incpriY) = qd; M -= qd; x += M; } qd = x; ql |= 1; - priY[i * incpriY] += qd; + primary((FOLD - 1) * incpriY) += qd; } else { ftype qd = x; auto& ql = get_bits(qd); - for (i = 0; i < FOLD - 1; i++) +#pragma unroll + for (int i = 0; i < FOLD - 1; i++) { - M = priY[i * incpriY]; + M = primary(i * incpriY); qd = x; ql |= 1; qd += M; - priY[i * incpriY] = qd; + primary(i * incpriY) = qd; M -= qd; x += M; } qd = x; ql |= 1; - priY[i * incpriY] += qd; + primary((FOLD - 1) * incpriY) += qd; } } @@ -348,26 +508,22 @@ namespace RFA { /// ///@param incpriX stride within X's primary vector (use every incpriX'th element) ///@param inccarX stride within X's carry vector (use every inccarX'th element) - void binned_dmrenorm(const int incpriX, const int inccarX) + inline void binned_dmrenorm(const int incpriX, const int inccarX) { - auto* priX = pvec(); - auto* carX = cvec(); - - if (priX[0] == 0.0 || ISNANINF(priX[0])) - { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) return; - } - for (int i = 0; i < FOLD; i++, priX += incpriX, carX += inccarX) + for (int i = 0; i < FOLD; i++) { - auto tmp_renormd = priX[0]; + auto tmp_renormd = primary(i * incpriX); auto& tmp_renorml = get_bits(tmp_renormd); - carX[0] += (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; tmp_renorml &= ~(1ull << (MANT_DIG - 3)); tmp_renorml |= 1ull << (MANT_DIG - 2); - priX[0] = tmp_renormd; + primary(i * incpriX) = tmp_renormd; } } @@ -392,18 +548,10 @@ namespace RFA { { int i = 0; - const auto* const priX = pvec(); - const auto* const carX = cvec(); - - if (ISNANINF(priX[0])) - { - return priX[0]; - } - - if (priX[0] == 0.0) - { + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) return 0.0; - } double Y = 0.0; double scale_down; @@ -413,30 +561,31 @@ namespace RFA { const auto* const bins = binned_bins(X_index); if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) { - scale_down = ldexp_impl(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); - scale_up = ldexp_impl(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); scaled = std::max( std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); if (X_index == 0) { - Y += carX[0] * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carX[inccarX] * ((bins[1] / 6.0) * scale_down); - Y += (priX[0] - bins[0]) * scale_down * EXPANSION; + Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); + Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); + Y += (primary(0) - bins[0]) * scale_down * EXPANSION; i = 2; } else { - Y += carX[0] * ((bins[0] / 6.0) * scale_down); + Y += carry(0) * ((bins[0] / 6.0) * scale_down); i = 1; } for (; i < scaled; i++) { - Y += carX[i * inccarX] * ((bins[i] / 6.0) * scale_down); - Y += (priX[(i - 1) * incpriX] - bins[i - 1]) * scale_down; + Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); + Y += + (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; } if (i == FOLD) { - Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]) * + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * scale_down; return Y * scale_up; } @@ -447,20 +596,20 @@ namespace RFA { Y *= scale_up; for (; i < FOLD; i++) { - Y += carX[i * inccarX] * (bins[i] / 6.0); - Y += priX[(i - 1) * incpriX] - bins[i - 1]; + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += primary((i - 1) * incpriX) - bins[i - 1]; } - Y += priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]; + Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; } else { - Y += carX[0] * (bins[0] / 6.0); + Y += carry(0) * (bins[0] / 6.0); for (i = 1; i < FOLD; i++) { - Y += carX[i * inccarX] * (bins[i] / 6.0); - Y += (priX[(i - 1) * incpriX] - bins[i - 1]); + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]); + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); } return Y; } @@ -473,18 +622,11 @@ namespace RFA { { int i = 0; double Y = 0.0; - const auto* const priX = pvec(); - const auto* const carX = cvec(); - - if (ISNANINF(priX[0])) - { - return priX[0]; - } - if (priX[0] == 0.0) - { + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) return 0.0; - } //Note that the following order of summation is in order of decreasing //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and @@ -493,23 +635,23 @@ namespace RFA { const auto* const bins = binned_bins(X_index); if (X_index == 0) { - Y += (double) carX[0] * (double) (bins[0] / 6.0) * + Y += (double) carry(0) * (double) (bins[0] / 6.0) * (double) EXPANSION; - Y += (double) carX[inccarX] * (double) (bins[1] / 6.0); - Y += (double) (priX[0] - bins[0]) * (double) EXPANSION; + Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; i = 2; } else { - Y += (double) carX[0] * (double) (bins[0] / 6.0); + Y += (double) carry(0) * (double) (bins[0] / 6.0); i = 1; } for (; i < FOLD; i++) { - Y += (double) carX[i * inccarX] * (double) (bins[i] / 6.0); - Y += (double) (priX[(i - 1) * incpriX] - bins[i - 1]); + Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += (double) (priX[(FOLD - 1) * incpriX] - bins[FOLD - 1]); + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); return (float) Y; } @@ -522,64 +664,78 @@ namespace RFA { ///@param inccarX stride within X's carry vector (use every inccarX'th element) ///@param incpriY stride within Y's primary vector (use every incpriY'th element) ///@param inccarY stride within Y's carry vector (use every inccarY'th element) - void binned_dmdmadd(const ReproducibleFloatingAccumulator& other, + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, const int incpriX, const int inccarX, const int incpriY, const int inccarY) { - auto* const priX = other.pvec(); - auto* const carX = other.cvec(); - auto* const priY = pvec(); - auto* const carY = cvec(); - - if (priX[0] == 0.0) + if (ISZERO(x.primary(0))) return; - if (priY[0] == 0.0) + if (ISZERO(primary(0))) { for (int i = 0; i < FOLD; i++) { - priY[i * incpriY] = priX[i * incpriX]; - carY[i * inccarY] = carX[i * inccarX]; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); } return; } - if (ISNANINF(priX[0]) || ISNANINF(priY[0])) + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) { - priY[0] += priX[0]; + primary(0) += x.primary(0); return; } - // const auto X_index = binned_index(priX); - const auto X_index = other.binned_index(); - const auto Y_index = binned_index(); + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); const auto shift = Y_index - X_index; if (shift > 0) { const auto* const bins = binned_bins(Y_index); //shift Y upwards and add X to Y - for (int i = FOLD - 1; i >= shift; i--) +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) { - priY[i * incpriY] = priX[i * incpriX] + - (priY[(i - shift) * incpriY] - bins[i - shift]); - carY[i * inccarY] = - carX[i * inccarX] + carY[(i - shift) * inccarY]; + if (i < shift) + break; + primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); } - for (int i = 0; i < shift && i < FOLD; i++) +#pragma unroll + for (int i = 0; i < FOLD; i++) { - priY[i * incpriY] = priX[i * incpriX]; - carY[i * inccarY] = carX[i * inccarX]; + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); } } - else + else if (shift < 0) { const auto* const bins = binned_bins(X_index); //shift X upwards and add X to Y - for (int i = 0 - shift; i < FOLD; i++) +#pragma unroll + for (int i = 0; i < FOLD; i++) { - priY[i * incpriY] += - priX[(i + shift) * incpriX] - bins[i + shift]; - carY[i * inccarY] += carX[(i + shift) * inccarX]; + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); + // add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); } } @@ -594,6 +750,13 @@ namespace RFA { } public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + ///Set the binned fp to zero void zero() { @@ -601,22 +764,27 @@ namespace RFA { } ///Return the fold of the binned fp - int fold() const + constexpr int fold() const { return FOLD; } + ///Return the endurance of the binned fp + constexpr int endurance() const + { + return ENDURANCE; + } + ///Returns the number of reference bins. Used for judging memory usage. - static constexpr size_t number_of_reference_bins() + constexpr size_t number_of_reference_bins() { - return bins.size(); + return std::array::size(); } ///Accumulate an arithmetic @p x into the binned fp. ///NOTE: Casts @p x to the type of the binned fp template ::value>::type* = - nullptr> + typename std::enable_if_t>* = nullptr> ReproducibleFloatingAccumulator& operator+=(const U x) { binned_dmdadd(static_cast(x), 1, 1); @@ -626,8 +794,7 @@ namespace RFA { ///Accumulate-subtract an arithmetic @p x into the binned fp. ///NOTE: Casts @p x to the type of the binned fp template ::value>::type* = - nullptr> + typename std::enable_if_t>* = nullptr> ReproducibleFloatingAccumulator& operator-=(const U x) { binned_dmdadd(-static_cast(x), 1, 1); @@ -666,8 +833,7 @@ namespace RFA { ///Sets this binned fp equal to the arithmetic value @p x ///NOTE: Casts @p x to the type of the binned fp template ::value>::type* = - nullptr> + typename std::enable_if_t>* = nullptr> ReproducibleFloatingAccumulator& operator=(const U x) { zero(); @@ -675,14 +841,6 @@ namespace RFA { return *this; } - ///Sets this binned fp equal to another binned fp - ReproducibleFloatingAccumulator& operator=( - const ReproducibleFloatingAccumulator& o) - { - data = o.data; - return *this; - } - ///Returns the negative of this binned fp ///NOTE: Makes a copy and performs arithmetic; slow. ReproducibleFloatingAccumulator operator-() @@ -690,14 +848,14 @@ namespace RFA { constexpr int incpriX = 1; constexpr int inccarX = 1; ReproducibleFloatingAccumulator temp = *this; - if (pvec()[0] != 0.0) + if (primary(0) != 0.0) { const auto* const bins = binned_bins(binned_index()); for (int i = 0; i < FOLD; i++) { - temp.pvec()[i * incpriX] = - bins[i] - (pvec()[i * incpriX] - bins[i]); - temp.cvec()[i * inccarX] = -cvec()[i * inccarX]; + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); } } return temp; @@ -706,7 +864,7 @@ namespace RFA { ///Convert this binned fp into its native floating-point representation ftype conv() const { - if (std::is_same::value) + if (std::is_same_v) { return binned_conv_single(1, 1); } @@ -729,9 +887,8 @@ namespace RFA { { const double X = std::abs(max_abs_val); const double S = std::abs(binned_sum); - return static_cast( - std::max(X, ldexp_impl(0.5, MIN_EXP - 1)) * - ldexp_impl(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + ((7.0 * EPSILON) / (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - 7.0 * EPSILON)) * @@ -754,10 +911,12 @@ namespace RFA { { binned_dmdupdate(std::abs(max_abs_val), 1, 1); size_t count = 0; + size_t N = last - first; for (; first != last; first++, count++) { binned_dmddeposit(static_cast(*first), 1); - if (count == ENDURANCE) + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) { binned_dmrenorm(1, 1); count = 0; @@ -788,14 +947,11 @@ namespace RFA { ///@param N Number of elements to add ///@param max_abs_val Maximum absolute value of any member of the range template ::value>::type* = - nullptr> + typename std::enable_if_t>* = nullptr> void add(const T* input, const size_t N, const ftype max_abs_val) { if (N == 0) - { return; - } add(input, input + N, max_abs_val); } @@ -807,19 +963,147 @@ namespace RFA { ///@param input Start of the range ///@param N Number of elements to add template ::value>::type* = - nullptr> + typename std::enable_if_t>* = nullptr> void add(const T* input, const size_t N) { if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = max(max_abs_val, std::abs(input[i])); + } + add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } } - T max_abs_val = input[0]; + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; for (size_t i = 0; i < N; i++) { - max_abs_val = std::max(max_abs_val, std::abs(input[i])); + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + add(input, N, max_abs_val); } @@ -833,8 +1117,7 @@ namespace RFA { ///with `unsafe_add` after which `renorm()` must be called. See the source of ///`add()` for an example template ::value>::type* = - nullptr> + typename std::enable_if_t>* = nullptr> void set_max_abs_val(const T mav) { binned_dmdupdate(std::abs(mav), 1, 1); @@ -858,11 +1141,4 @@ namespace RFA { } }; - template <> - auto ReproducibleFloatingAccumulator::bins = - ReproducibleFloatingAccumulator::initialize_bins(); - - template <> - auto ReproducibleFloatingAccumulator::bins = - ReproducibleFloatingAccumulator::initialize_bins(); -} // namespace RFA +} // namespace hpx::parallel::detail::rfa \ No newline at end of file diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp deleted file mode 100644 index 05f71d9ae746..000000000000 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp +++ /dev/null @@ -1,1168 +0,0 @@ -//Reproducible Floating Point Accumulations via Binned Floating Point -//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. -//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. -// -//The code accomplishes several objectives: -// -//1. Reproducible summation, independent of summation order, assuming only a -// subset of the IEEE 754 Floating Point Standard -// -//2. Has accuracy at least as good as conventional summation, and tunable -// -//3. Handles overflow, underflow, and other exceptions reproducibly. -// -//4. Makes only one read-only pass over the summands. -// -//5. Requires only one parallel reduction. -// -//6. Uses minimal memory (6 doubles per accumulator with fold=3). -// -//7. Relatively easy to use - -#pragma once - -#include -#include -#include -#include -#include - -#ifndef __CUDACC__ -#define __host__ -#define __device__ -#define __forceinline__ -#include -using std::array; -using std::max; -using std::min; -#else -#include -using cuda::std::array; -using cuda::std::max; -using cuda::std::min; -#include "vector.hpp" -#endif - -namespace hpx::parallel::detail::rfa { - template - struct type4 - { - F x; - F y; - F z; - F w; - }; - - template - struct type2 - { - F x; - F y; - }; - using float4 = type4; - using double4 = type4; - using float2 = type2; - using double2 = type2; - - auto abs_max(float4 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - auto z = std::abs(a.z); - auto w = std::abs(a.w); - const std::vector v = {x, y, z, w}; - return *std::max_element(v.begin(), v.end()); - } - - auto abs_max(double4 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - auto z = std::abs(a.z); - auto w = std::abs(a.w); - const std::vector v = {x, y, z, w}; - return *std::max_element(v.begin(), v.end()); - } - - auto abs_max(float2 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - const std::vector v = {x, y}; - return *std::max_element(v.begin(), v.end()); - } - - auto abs_max(double2 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - const std::vector v = {x, y}; - return *std::max_element(v.begin(), v.end()); - } - -// disable zero checks -#define DISABLE_ZERO - -// disable nan / infinity checks -#define DISABLE_NANINF - -// jump table for indexing into data -#define MAX_JUMP 5 - static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); - - template - inline constexpr Real ldexp_impl(Real arg, int exp) noexcept - { - return std::ldexp(arg, exp); - // while (arg == 0) - // { - // return arg; - // } - // while (exp > 0) - // { - // arg *= static_cast(2); - // --exp; - // } - // while (exp < 0) - // { - // arg /= static_cast(2); - // ++exp; - // } - - // return arg; - } - - template - struct RFA_bins - { - static constexpr auto BIN_WIDTH = - std::is_same_v ? 40 : 13; - static constexpr auto MIN_EXP = - std::numeric_limits::min_exponent; - static constexpr auto MAX_EXP = - std::numeric_limits::max_exponent; - static constexpr auto MANT_DIG = std::numeric_limits::digits; - ///Binned floating-point maximum index - static constexpr auto MAXINDEX = - ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; - //The maximum floating-point fold supported by the library - static constexpr auto MAXFOLD = MAXINDEX + 1; - - ///The binned floating-point reference bins - array bins = {}; - - constexpr ftype& operator[](int d) - { - return bins[d]; - } - - void initialize_bins() - { - if constexpr (std::is_same_v) - { - bins[0] = std::ldexp(0.75, MAX_EXP); - } - else - { - bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1); - } - - for (int index = 1; index <= MAXINDEX; index++) - { - bins[index] = ldexp(0.75, - MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); - } - for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) - { - bins[index] = bins[index - 1]; - } - } - }; - - static char bin_host_buffer[sizeof(RFA_bins)]; -#ifdef __CUDACC__ - __constant__ static char bin_device_buffer[sizeof(RFA_bins)]; -#endif - - ///Class to hold a reproducible summation of the numbers passed to it - /// - ///@param ftype Floating-point data type; either `float` or `double - ///@param FOLD The fold; use 3 as a default unless you understand it. - template ::value>* = - nullptr> - class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator - { - public: - using ftype = ftype_; - static constexpr int FOLD = FOLD_; - - private: - array data = {0}; - - ///Floating-point precision bin width - static constexpr auto BIN_WIDTH = - std::is_same_v ? 40 : 13; - static constexpr auto MIN_EXP = - std::numeric_limits::min_exponent; - static constexpr auto MAX_EXP = - std::numeric_limits::max_exponent; - static constexpr auto MANT_DIG = std::numeric_limits::digits; - ///Binned floating-point maximum index - static constexpr auto MAXINDEX = - ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; - //The maximum floating-point fold supported by the library - static constexpr auto MAXFOLD = MAXINDEX + 1; - ///Binned floating-point compression factor - ///This factor is used to scale down inputs before deposition into the bin of - ///highest index - static constexpr auto COMPRESSION = - 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); - ///Binned double precision expansion factor - ///This factor is used to scale up inputs after deposition into the bin of - ///highest index - static constexpr auto EXPANSION = - 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); - static constexpr auto EXP_BIAS = MAX_EXP - 2; - static constexpr auto EPSILON = std::numeric_limits::epsilon(); - ///Binned floating-point deposit endurance - ///The number of deposits that can be performed before a renorm is necessary. - ///Applies also to binned complex double precision. - static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); - - ///Return a binned floating-point reference bin - inline const ftype* binned_bins(const int x) const - { -#ifdef __CUDA_ARCH__ // must be arch not CC here - return &reinterpret_cast&>(bin_device_buffer)[x]; -#else - return &reinterpret_cast&>(bin_host_buffer)[x]; -#endif - } - - ///Get the bit representation of a float - static inline uint32_t& get_bits(float& x) - { - return *reinterpret_cast(&x); - } - ///Get the bit representation of a double - static inline uint64_t& get_bits(double& x) - { - return *reinterpret_cast(&x); - } - ///Get the bit representation of a const float - static inline uint32_t get_bits(const float& x) - { - return *reinterpret_cast(&x); - } - ///Get the bit representation of a const double - static inline uint64_t get_bits(const double& x) - { - return *reinterpret_cast(&x); - } - - ///Return primary vector value const ref - inline const ftype& primary(int i) const - { - if constexpr (FOLD <= MAX_JUMP) - { - switch (i) - { - case 0: - if constexpr (FOLD >= 1) - return data[0]; - case 1: - if constexpr (FOLD >= 2) - return data[1]; - case 2: - if constexpr (FOLD >= 3) - return data[2]; - case 3: - if constexpr (FOLD >= 4) - return data[3]; - case 4: - if constexpr (FOLD >= 5) - return data[4]; - default: - return data[FOLD - 1]; - } - } - else - { - return data[i]; - } - } - - ///Return carry vector value const ref - inline const ftype& carry(int i) const - { - if constexpr (FOLD <= MAX_JUMP) - { - switch (i) - { - case 0: - if constexpr (FOLD >= 1) - return data[FOLD + 0]; - case 1: - if constexpr (FOLD >= 2) - return data[FOLD + 1]; - case 2: - if constexpr (FOLD >= 3) - return data[FOLD + 2]; - case 3: - if constexpr (FOLD >= 4) - return data[FOLD + 3]; - case 4: - if constexpr (FOLD >= 5) - return data[FOLD + 4]; - default: - return data[2 * FOLD - 1]; - } - } - else - { - return data[FOLD + i]; - } - } - - ///Return primary vector value ref - inline ftype& primary(int i) - { - const auto& c = *this; - return const_cast(c.primary(i)); - } - - ///Return carry vector value ref - inline ftype& carry(int i) - { - const auto& c = *this; - return const_cast(c.carry(i)); - } - -#ifdef DISABLE_ZERO - static inline constexpr bool ISZERO(const ftype) - { - return false; - } -#else - static inline constexpr bool ISZERO(const ftype x) - { - return x == 0.0; - } -#endif - -#ifdef DISABLE_NANINF - static inline constexpr int ISNANINF(const ftype) - { - return false; - } -#else - static inline constexpr int ISNANINF(const ftype x) - { - const auto bits = get_bits(x); - return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == - ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); - } -#endif - - static inline constexpr int EXP(const ftype x) - { - const auto bits = get_bits(x); - return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); - } - - ///Get index of float-point precision - ///The index of a non-binned type is the smallest index a binned type would - ///need to have to sum it reproducibly. Higher indicies correspond to smaller - ///bins. - static inline constexpr int binned_dindex(const ftype x) - { - int exp = EXP(x); - if (exp == 0) - { - if (x == 0.0) - { - return MAXINDEX; - } - else - { - frexp(x, &exp); - return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); - } - } - return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; - } - - ///Get index of manually specified binned double precision - ///The index of a binned type is the bin that it corresponds to. Higher - ///indicies correspond to smaller bins. - inline int binned_index() const - { - return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - - EXP(primary(0))) / - BIN_WIDTH; - } - - ///Check if index of manually specified binned floating-point is 0 - ///A quick check to determine if the index is 0 - inline bool binned_index0() const - { - return EXP(primary(0)) == MAX_EXP + EXP_BIAS; - } - - ///Update manually specified binned fp with a scalar (X -> Y) - /// - ///This method updates the binned fp to an index suitable for adding numbers - ///with absolute value less than @p max_abs_val - /// - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - ///@param inccarY stride within Y's carry vector (use every inccarY'th element) - void binned_dmdupdate( - const ftype max_abs_val, const int incpriY, const int inccarY) - { - if (ISNANINF(primary(0))) - return; - - int X_index = binned_dindex(max_abs_val); - if (ISZERO(primary(0))) - { - const ftype* const bins = binned_bins(X_index); - for (int i = 0; i < FOLD; i++) - { - primary(i * incpriY) = bins[i]; - carry(i * inccarY) = 0.0; - } - } - else - { - int shift = binned_index() - X_index; - if (shift > 0) - { -#pragma unroll - for (int i = FOLD - 1; i >= 1; i--) - { - if (i < shift) - break; - primary(i * incpriY) = primary((i - shift) * incpriY); - carry(i * inccarY) = carry((i - shift) * inccarY); - } - const ftype* const bins = binned_bins(X_index); -#pragma unroll - for (int j = 0; j < FOLD; j++) - { - if (j >= shift) - break; - primary(j * incpriY) = bins[j]; - carry(j * inccarY) = 0.0; - } - } - } - } - - ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) - /// - ///Performs the operation Y += X on an binned type Y where the index of Y is - ///larger than the index of @p X - /// - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - void binned_dmddeposit(const ftype X, const int incpriY) - { - ftype M; - ftype x = X; - - if (ISNANINF(x) || ISNANINF(primary(0))) - { - primary(0) += x; - return; - } - - if (binned_index0()) - { - M = primary(0); - ftype qd = x * COMPRESSION; - auto& ql = get_bits(qd); - ql |= 1; - qd += M; - primary(0) = qd; - M -= qd; - M *= EXPANSION * 0.5; - x += M; - x += M; -#pragma unroll - for (int i = 1; i < FOLD - 1; i++) - { - M = primary(i * incpriY); - qd = x; - ql |= 1; - qd += M; - primary(i * incpriY) = qd; - M -= qd; - x += M; - } - qd = x; - ql |= 1; - primary((FOLD - 1) * incpriY) += qd; - } - else - { - ftype qd = x; - auto& ql = get_bits(qd); -#pragma unroll - for (int i = 0; i < FOLD - 1; i++) - { - M = primary(i * incpriY); - qd = x; - ql |= 1; - qd += M; - primary(i * incpriY) = qd; - M -= qd; - x += M; - } - qd = x; - ql |= 1; - primary((FOLD - 1) * incpriY) += qd; - } - } - - ///Renormalize manually specified binned double precision - /// - ///Renormalization keeps the primary vector within the necessary bins by - ///shifting over to the carry vector - /// - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - inline void binned_dmrenorm(const int incpriX, const int inccarX) - { - if (ISZERO(primary(0)) || ISNANINF(primary(0))) - return; - - for (int i = 0; i < FOLD; i++) - { - auto tmp_renormd = primary(i * incpriX); - auto& tmp_renorml = get_bits(tmp_renormd); - - carry(i * inccarX) += - (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; - - tmp_renorml &= ~(1ull << (MANT_DIG - 3)); - tmp_renorml |= 1ull << (MANT_DIG - 2); - primary(i * incpriX) = tmp_renormd; - } - } - - ///Add scalar to manually specified binned fp (Y += X) - /// - ///Performs the operation Y += X on an binned type Y - /// - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - ///@param inccarY stride within Y's carry vector (use every inccarY'th element) - void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) - { - binned_dmdupdate(X, incpriY, inccarY); - binned_dmddeposit(X, incpriY); - binned_dmrenorm(incpriY, inccarY); - } - - ///Convert manually specified binned fp to native double-precision (X -> Y) - /// - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - double binned_conv_double(const int incpriX, const int inccarX) const - { - int i = 0; - - if (ISNANINF(primary(0))) - return primary(0); - if (ISZERO(primary(0))) - return 0.0; - - double Y = 0.0; - double scale_down; - double scale_up; - int scaled; - const auto X_index = binned_index(); - const auto* const bins = binned_bins(X_index); - if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) - { - scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); - scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); - scaled = - max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); - if (X_index == 0) - { - Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); - Y += (primary(0) - bins[0]) * scale_down * EXPANSION; - i = 2; - } - else - { - Y += carry(0) * ((bins[0] / 6.0) * scale_down); - i = 1; - } - for (; i < scaled; i++) - { - Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); - Y += - (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; - } - if (i == FOLD) - { - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * - scale_down; - return Y * scale_up; - } - if (std::isinf(Y * scale_up)) - { - return Y * scale_up; - } - Y *= scale_up; - for (; i < FOLD; i++) - { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += primary((i - 1) * incpriX) - bins[i - 1]; - } - Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; - } - else - { - Y += carry(0) * (bins[0] / 6.0); - for (i = 1; i < FOLD; i++) - { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += (primary((i - 1) * incpriX) - bins[i - 1]); - } - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); - } - return Y; - } - - ///Convert manually specified binned fp to native single-precision (X -> Y) - /// - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - float binned_conv_single(const int incpriX, const int inccarX) const - { - int i = 0; - double Y = 0.0; - - if (ISNANINF(primary(0))) - return primary(0); - if (ISZERO(primary(0))) - return 0.0; - - //Note that the following order of summation is in order of decreasing - //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and - //the number of carries equal to 1. - const auto X_index = binned_index(); - const auto* const bins = binned_bins(X_index); - if (X_index == 0) - { - Y += (double) carry(0) * (double) (bins[0] / 6.0) * - (double) EXPANSION; - Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); - Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; - i = 2; - } - else - { - Y += (double) carry(0) * (double) (bins[0] / 6.0); - i = 1; - } - for (; i < FOLD; i++) - { - Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); - Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); - } - Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); - - return (float) Y; - } - - ///Add two manually specified binned fp (Y += X) - ///Performs the operation Y += X - /// - ///@param other Another binned fp of the same type - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - ///@param inccarY stride within Y's carry vector (use every inccarY'th element) - void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, - const int incpriX, const int inccarX, const int incpriY, - const int inccarY) - { - if (ISZERO(x.primary(0))) - return; - - if (ISZERO(primary(0))) - { - for (int i = 0; i < FOLD; i++) - { - primary(i * incpriY) = x.primary(i * incpriX); - carry(i * inccarY) = x.carry(i * inccarX); - } - return; - } - - if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) - { - primary(0) += x.primary(0); - return; - } - - const auto X_index = x.binned_index(); - const auto Y_index = this->binned_index(); - const auto shift = Y_index - X_index; - if (shift > 0) - { - const auto* const bins = binned_bins(Y_index); - //shift Y upwards and add X to Y -#pragma unroll - for (int i = FOLD - 1; i >= 1; i--) - { - if (i < shift) - break; - primary(i * incpriY) = x.primary(i * incpriX) + - (primary((i - shift) * incpriY) - bins[i - shift]); - carry(i * inccarY) = - x.carry(i * inccarX) + carry((i - shift) * inccarY); - } -#pragma unroll - for (int i = 0; i < FOLD; i++) - { - if (i == shift) - break; - primary(i * incpriY) = x.primary(i * incpriX); - carry(i * inccarY) = x.carry(i * inccarX); - } - } - else if (shift < 0) - { - const auto* const bins = binned_bins(X_index); - //shift X upwards and add X to Y -#pragma unroll - for (int i = 0; i < FOLD; i++) - { - if (i < -shift) - continue; - primary(i * incpriY) += - x.primary((i + shift) * incpriX) - bins[i + shift]; - carry(i * inccarY) += x.carry((i + shift) * inccarX); - } - } - else if (shift == 0) - { - const auto* const bins = binned_bins(X_index); - // add X to Y -#pragma unroll - for (int i = 0; i < FOLD; i++) - { - primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; - carry(i * inccarY) += x.carry(i * inccarX); - } - } - - binned_dmrenorm(incpriY, inccarY); - } - - ///Add two manually specified binned fp (Y += X) - ///Performs the operation Y += X - void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) - { - binned_dmdmadd(other, 1, 1, 1, 1); - } - - public: - ReproducibleFloatingAccumulator() = default; - ReproducibleFloatingAccumulator( - const ReproducibleFloatingAccumulator&) = default; - ///Sets this binned fp equal to another binned fp - ReproducibleFloatingAccumulator& operator=( - const ReproducibleFloatingAccumulator&) = default; - - ///Set the binned fp to zero - void zero() - { - data = {0}; - } - - ///Return the fold of the binned fp - constexpr int fold() const - { - return FOLD; - } - - ///Return the endurance of the binned fp - constexpr int endurance() const - { - return ENDURANCE; - } - - ///Returns the number of reference bins. Used for judging memory usage. - constexpr size_t number_of_reference_bins() - { - return array::size(); - } - - ///Accumulate an arithmetic @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - template >* = nullptr> - ReproducibleFloatingAccumulator& operator+=(const U x) - { - binned_dmdadd(static_cast(x), 1, 1); - return *this; - } - - ///Accumulate-subtract an arithmetic @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - template >* = nullptr> - ReproducibleFloatingAccumulator& operator-=(const U x) - { - binned_dmdadd(-static_cast(x), 1, 1); - return *this; - } - - ///Accumulate a binned fp @p x into the binned fp. - ReproducibleFloatingAccumulator& operator+=( - const ReproducibleFloatingAccumulator& other) - { - binned_dbdbadd(other); - return *this; - } - - ///Accumulate-subtract a binned fp @p x into the binned fp. - ///NOTE: Makes a copy and performs arithmetic; slow. - ReproducibleFloatingAccumulator& operator-=( - const ReproducibleFloatingAccumulator& other) - { - const auto temp = -other; - binned_dbdbadd(temp); - } - - ///Determines if two binned fp are equal - bool operator==(const ReproducibleFloatingAccumulator& other) const - { - return data == other.data; - } - - ///Determines if two binned fp are not equal - bool operator!=(const ReproducibleFloatingAccumulator& other) const - { - return !operator==(other); - } - - ///Sets this binned fp equal to the arithmetic value @p x - ///NOTE: Casts @p x to the type of the binned fp - template >* = nullptr> - ReproducibleFloatingAccumulator& operator=(const U x) - { - zero(); - binned_dmdadd(static_cast(x), 1, 1); - return *this; - } - - ///Returns the negative of this binned fp - ///NOTE: Makes a copy and performs arithmetic; slow. - ReproducibleFloatingAccumulator operator-() - { - constexpr int incpriX = 1; - constexpr int inccarX = 1; - ReproducibleFloatingAccumulator temp = *this; - if (primary(0) != 0.0) - { - const auto* const bins = binned_bins(binned_index()); - for (int i = 0; i < FOLD; i++) - { - temp.primary(i * incpriX) = - bins[i] - (primary(i * incpriX) - bins[i]); - temp.carry(i * inccarX) = -carry(i * inccarX); - } - } - return temp; - } - - ///Convert this binned fp into its native floating-point representation - ftype conv() const - { - if (std::is_same_v) - { - return binned_conv_single(1, 1); - } - else - { - return binned_conv_double(1, 1); - } - } - - ///@brief Get binned fp summation error bound - /// - ///This is a bound on the absolute error of a summation using binned types - /// - ///@param N The number of single precision floating point summands - ///@param max_abs_val The summand of maximum absolute value - ///@param binned_sum The value of the sum computed using binned types - ///@return The absolute error bound - static constexpr ftype error_bound( - const uint64_t N, const ftype max_abs_val, const ftype binned_sum) - { - const double X = std::abs(max_abs_val); - const double S = std::abs(binned_sum); - return static_cast(max(X, ldexp(0.5, MIN_EXP - 1)) * - ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + - ((7.0 * EPSILON) / - (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - - 7.0 * EPSILON)) * - S); - } - - ///Add @p x to the binned fp - void add(const ftype x) - { - binned_dmdadd(x, 1, 1); - } - - ///Add arithmetics in the range [first, last) to the binned fp - /// - ///@param first Start of range - ///@param last End of range - ///@param max_abs_val Maximum absolute value of any member of the range - template - void add(InputIt first, InputIt last, const ftype max_abs_val) - { - binned_dmdupdate(std::abs(max_abs_val), 1, 1); - size_t count = 0; - size_t N = last - first; - for (; first != last; first++, count++) - { - binned_dmddeposit(static_cast(*first), 1); - // first conditional allows compiler to remove the call here when possible - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - ///Add arithmetics in the range [first, last) to the binned fp - /// - ///NOTE: A maximum absolute value is calculated, so two passes are made over - /// the data - /// - ///@param first Start of range - ///@param last End of range - template - void add(InputIt first, InputIt last) - { - const auto max_abs_val = *std::max_element( - first, last, [](const auto& a, const auto& b) { - return std::abs(a) < std::abs(b); - }); - add(first, last, static_cast(max_abs_val)); - } - - ///Add @p N elements starting at @p input to the binned fp: [input, input+N) - /// - ///@param input Start of the range - ///@param N Number of elements to add - ///@param max_abs_val Maximum absolute value of any member of the range - template >* = nullptr> - void add(const T* input, const size_t N, const ftype max_abs_val) - { - if (N == 0) - return; - add(input, input + N, max_abs_val); - } - - ///Add @p N elements starting at @p input to the binned fp: [input, input+N) - /// - ///NOTE: A maximum absolute value is calculated, so two passes are made over - /// the data - /// - ///@param input Start of the range - ///@param N Number of elements to add - template >* = nullptr> - void add(const T* input, const size_t N) - { - if (N == 0) - return; - - T max_abs_val = input[0]; - for (size_t i = 0; i < N; i++) - { - max_abs_val = max(max_abs_val, std::abs(input[i])); - } - add(input, N, max_abs_val); - } - - ///Accumulate a float4 @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - ReproducibleFloatingAccumulator& operator+=(const float4& x) - { - binned_dmdupdate(abs_max(x), 1, 1); - binned_dmddeposit(static_cast(x.x), 1); - binned_dmddeposit(static_cast(x.y), 1); - binned_dmddeposit(static_cast(x.z), 1); - binned_dmddeposit(static_cast(x.w), 1); - return *this; - } - - ///Accumulate a double2 @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - ReproducibleFloatingAccumulator& operator+=(const float2& x) - { - binned_dmdupdate(abs_max(x), 1, 1); - binned_dmddeposit(static_cast(x.x), 1); - binned_dmddeposit(static_cast(x.y), 1); - return *this; - } - - ///Accumulate a double2 @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - ReproducibleFloatingAccumulator& operator+=(const double2& x) - { - binned_dmdupdate(abs_max(x), 1, 1); - binned_dmddeposit(static_cast(x.x), 1); - binned_dmddeposit(static_cast(x.y), 1); - return *this; - } - - void add(const float4* input, const size_t N, float max_abs_val) - { - if (N == 0) - return; - binned_dmdupdate(max_abs_val, 1, 1); - - size_t count = 0; - for (size_t i = 0; i < N; i++) - { - binned_dmddeposit(static_cast(input[i].x), 1); - binned_dmddeposit(static_cast(input[i].y), 1); - binned_dmddeposit(static_cast(input[i].z), 1); - binned_dmddeposit(static_cast(input[i].w), 1); - - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - void add(const double2* input, const size_t N, double max_abs_val) - { - if (N == 0) - return; - binned_dmdupdate(max_abs_val, 1, 1); - - size_t count = 0; - for (size_t i = 0; i < N; i++) - { - binned_dmddeposit(static_cast(input[i].x), 1); - binned_dmddeposit(static_cast(input[i].y), 1); - - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - void add(const float2* input, const size_t N, double max_abs_val) - { - if (N == 0) - return; - binned_dmdupdate(max_abs_val, 1, 1); - - size_t count = 0; - for (size_t i = 0; i < N; i++) - { - binned_dmddeposit(static_cast(input[i].x), 1); - binned_dmddeposit(static_cast(input[i].y), 1); - - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - void add(const float4* input, const size_t N) - { - if (N == 0) - return; - - auto max_abs_val = abs_max(input[0]); - for (size_t i = 1; i < N; i++) - max_abs_val = fmax(max_abs_val, abs_max(input[i])); - - add(input, N, max_abs_val); - } - - void add(const double2* input, const size_t N) - { - if (N == 0) - return; - - auto max_abs_val = abs_max(input[0]); - for (size_t i = 1; i < N; i++) - max_abs_val = fmax(max_abs_val, abs_max(input[i])); - - add(input, N, max_abs_val); - } - - void add(const float2* input, const size_t N) - { - if (N == 0) - return; - - auto max_abs_val = abs_max(input[0]); - for (size_t i = 1; i < N; i++) - max_abs_val = fmax(max_abs_val, abs_max(input[i])); - - add(input, N, max_abs_val); - } - - ////////////////////////////////////// - //MANUAL OPERATIONS; USE WISELY - ////////////////////////////////////// - - ///Rebins for repeated accumulation of scalars with magnitude <= @p mav - /// - ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator - ///with `unsafe_add` after which `renorm()` must be called. See the source of - ///`add()` for an example - template >* = nullptr> - void set_max_abs_val(const T mav) - { - binned_dmdupdate(std::abs(mav), 1, 1); - } - - ///Add @p x to the binned fp - /// - ///This is intended to be used after a call to `set_max_abs_val()` - void unsafe_add(const ftype x) - { - binned_dmddeposit(x, 1); - } - - ///Renormalizes the binned fp - /// - ///This is intended to be used after a call to `set_max_abs_val()` and one or - ///more calls to `unsafe_add()` - void renorm() - { - binned_dmrenorm(1, 1); - } - }; - - -} // namespace hpx::parallel::detail::rfa \ No newline at end of file diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 462b1473f332..0a67dea03026 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include From 6a99390451e0f3f261a0f8aaa1ffe63e1fa6b37b Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 9 Dec 2024 18:52:14 -0600 Subject: [PATCH 6/9] Added parallel execution of rfa reduction summation Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 93 +++++++++++++++++++ .../algorithms/reduce_deterministic.hpp | 74 ++++++++------- .../unit/algorithms/reduce_deterministic.cpp | 44 +++++++++ 3 files changed, 178 insertions(+), 33 deletions(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 76f093837def..b2de030eed8b 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,84 @@ namespace hpx::parallel::detail { } }; + template + struct sequential_reduce_deterministic_rfa_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_rfa_t> + { + private: + template + friend constexpr hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator + tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, + ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + + for (auto e = first; e != last; ++e) + { + rfa += *e; + } + return rfa; + } + + template + friend constexpr hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator + tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, + ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + auto e = first; + for (std::size_t i = 0; i < size; ++i, ++e) + { + rfa += *e; + } + return rfa; + } + + // template , + // // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< + // // double>>::value> + // > + // friend constexpr T tag_fallback_invoke( + // sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, + // InIterE last, T init, Reduce&& r) + // { + // static_assert(hpx::util::contains, + // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< + // double>>::value); + // hpx::parallel::detail::rfa::RFA_bins bins; + // bins.initialize_bins(); + // std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + + // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + // rfa.set_max_abs_val(init); + // rfa.unsafe_add(init); + // rfa.renorm(); + // for (auto e = first; e != last; ++e) + // { + // rfa += *e; + // } + // return rfa.conv(); + // } + }; + #if !defined(HPX_COMPUTE_DEVICE_CODE) template inline constexpr sequential_reduce_deterministic_t @@ -77,4 +156,18 @@ namespace hpx::parallel::detail { } #endif +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_rfa_t + sequential_reduce_deterministic_rfa = + sequential_reduce_deterministic_rfa_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa( + Args&&... args) + { + return sequential_reduce_deterministic_rfa_t{}( + std::forward(args)...); + } +#endif } // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index 56c8d3349898..b8435eec9a02 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -10,6 +10,7 @@ #pragma once +#include "detail/reduce_deterministic.hpp" #if defined(DOXYGEN) namespace hpx { @@ -396,42 +397,49 @@ namespace hpx::parallel { static constexpr T sequential(ExPolicy&& policy, InIterB first, InIterE last, T_&& init, Reduce&& r) { - // hpx::parallel::detail::sequential_reduce_deterministic_t seq; - // return seq(policy,first,last,0.0f,r); - return hpx::parallel::detail::sequential_reduce_deterministic( - HPX_FORWARD(ExPolicy, policy), first, last, + return hpx::parallel::detail::sequential_reduce_deterministic< + ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); } - // template - // static util::detail::algorithm_result_t parallel( - // ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init, - // Reduce&& r) - // { - // if (first == last) - // { - // return util::detail::algorithm_result::get( - // HPX_FORWARD(T_, init)); - // } - - // auto f1 = [r](FwdIterB part_begin, std::size_t part_size) -> T { - // T val = *part_begin; - // return detail::sequential_reduce( - // ++part_begin, --part_size, HPX_MOVE(val), r); - // }; - - // return util::partitioner::call( - // HPX_FORWARD(ExPolicy, policy), first, - // detail::distance(first, last), HPX_MOVE(f1), - // hpx::unwrapping( - // [init = HPX_FORWARD(T_, init), - // r = HPX_FORWARD(Reduce, r)](auto&& results) -> T { - // return detail::sequential_reduce( - // hpx::util::begin(results), - // hpx::util::size(results), init, r); - // })); - // } + template + static util::detail::algorithm_result_t parallel( + ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init, + Reduce&& r) + { + if (first == last) + { + return util::detail::algorithm_result::get( + HPX_FORWARD(T_, init)); + } + + auto f1 = [r, policy]( + FwdIterB part_begin, std::size_t part_size) + -> hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator { + T val = *part_begin; + return hpx::parallel::detail:: + sequential_reduce_deterministic_rfa( + HPX_FORWARD(ExPolicy, policy), ++part_begin, + --part_size, HPX_MOVE(val), r); + }; + + return util::partitioner>::call(HPX_FORWARD(ExPolicy, policy), first, + detail::distance(first, last), HPX_MOVE(f1), + hpx::unwrapping([init = HPX_FORWARD(T_, init), + r = HPX_FORWARD(Reduce, r), + policy](auto&& results) -> T { + return hpx::parallel::detail:: + sequential_reduce_deterministic_rfa( + HPX_FORWARD(ExPolicy, policy), + hpx::util::begin(results), + hpx::util::size(results), init, r) + .conv(); + })); + } }; /// \endcond } // namespace detail diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 0a67dea03026..694b50cfb76d 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -79,6 +79,49 @@ void test_reduce1(IteratorTag) HPX_TEST_EQ(r2, r3); } +template +void test_reduce_parallel1(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); + + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par, + iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + // verify values + // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par, + // iterator_ndet(std::begin(nondeterministic)), + // iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(r1, r3); + // HPX_TEST_EQ(r2, r3); +} + template void test_reduce_determinism(IteratorTag) @@ -178,6 +221,7 @@ void test_reduce1() test_reduce1(IteratorTag()); test_reduce1(IteratorTag()); test_reduce1(IteratorTag()); + test_reduce_parallel1(IteratorTag()); } template From 520f1613edd947bdb30e3702d39c7d9bb6f96d29 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 16 Dec 2024 19:24:47 -0600 Subject: [PATCH 7/9] Add reproducible summation reduction - Currently supports sequential version of reduction - Tests determinism by shuffling order of floating point numbers Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 80 ++ .../hpx/parallel/algorithms/detail/rfa.hpp | 1145 +++++++++++++++++ .../algorithms/reduce_deterministic.hpp | 537 ++++++++ .../tests/unit/algorithms/CMakeLists.txt | 1 + .../unit/algorithms/reduce_deterministic.cpp | 272 ++++ 5 files changed, 2035 insertions(+) create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp create mode 100644 libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp new file mode 100644 index 000000000000..87069858f492 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -0,0 +1,80 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "rfa.hpp" + +namespace hpx::parallel::detail { + + template + struct sequential_reduce_deterministic_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_t> + { + private: + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, + InIterE last, T init, Reduce&& r) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + size_t count = 0; + T max_val = std::abs(*first); + for (auto e = first; e != last; ++e) + { + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } + } + return rfa.conv(); + } + }; + +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_t + sequential_reduce_deterministic = + sequential_reduce_deterministic_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic( + Args&&... args) + { + return sequential_reduce_deterministic_t{}( + std::forward(args)...); + } +#endif + +} // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp new file mode 100644 index 000000000000..302f823fab71 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -0,0 +1,1145 @@ +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include + +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + + template + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept + { + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; + } + + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + std::array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = std::ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char __rfa_bin_host_buffer__[sizeof(RFA_bins)]; + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>* = + nullptr> + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator + { + public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: + std::array data = {0}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. + static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Return a binned floating-point reference bin + inline const ftype* binned_bins(const int x) const + { + return &reinterpret_cast&>( + __rfa_bin_host_buffer__)[x]; + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } + } + + ///Return primary vector value ref + inline ftype& primary(int i) + { + const auto& c = *this; + return const_cast(c.primary(i)); + } + + ///Return carry vector value ref + inline ftype& carry(int i) + { + const auto& c = *this; + return const_cast(c.carry(i)); + } + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) + { + return false; + } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif + +#ifdef DISABLE_NANINF + static inline constexpr int ISNANINF(const ftype) + { + return false; + } +#else + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } +#endif + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == 0.0) + { + return MAXINDEX; + } + else + { + std::frexp(x, &exp); + return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indicies correspond to smaller bins. + inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(primary(0))) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + if (ISNANINF(primary(0))) + return; + + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) + { + const ftype* const bins = binned_bins(X_index); + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; + } + } + else + { + int shift = binned_index() - X_index; + if (shift > 0) + { +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); + } + const ftype* const bins = binned_bins(X_index); +#pragma unroll + for (int j = 0; j < FOLD; j++) + { + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + ftype M; + ftype x = X; + + if (ISNANINF(x) || ISNANINF(primary(0))) + { + primary(0) += x; + return; + } + + if (binned_index0()) + { + M = primary(0); + ftype qd = x * COMPRESSION; + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + primary(0) = qd; + M -= qd; + M *= EXPANSION * 0.5; + x += M; + x += M; +#pragma unroll + for (int i = 1; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); +#pragma unroll + for (int i = 0; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + inline void binned_dmrenorm(const int incpriX, const int inccarX) + { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) + return; + + for (int i = 0; i < FOLD; i++) + { + auto tmp_renormd = primary(i * incpriX); + auto& tmp_renorml = get_bits(tmp_renormd); + + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + primary(i * incpriX) = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = std::max( + std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); + Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); + Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + i = 2; + } + else + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); + Y += + (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + } + if (i == FOLD) + { + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += primary((i - 1) * incpriX) - bins[i - 1]; + } + Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + } + else + { + Y += carry(0) * (bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + } + return Y; + } + + ///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carry(0) * (double) (bins[0] / 6.0) * + (double) EXPANSION; + Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carry(0) * (double) (bins[0] / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + if (ISZERO(x.primary(0))) + return; + + if (ISZERO(primary(0))) + { + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + return; + } + + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) + { + primary(0) += x.primary(0); + return; + } + + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); + //shift Y upwards and add X to Y +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); + } +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + } + else if (shift < 0) + { + const auto* const bins = binned_bins(X_index); + //shift X upwards and add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); + // add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); + } + } + + binned_dmrenorm(incpriY, inccarY); + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + constexpr int fold() const + { + return FOLD; + } + + ///Return the endurance of the binned fp + constexpr int endurance() const + { + return ENDURANCE; + } + + ///Returns the number of reference bins. Used for judging memory usage. + constexpr size_t number_of_reference_bins() + { + return std::array::size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (primary(0) != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same_v) + { + return binned_conv_single(1, 1); + } + else + { + return binned_conv_double(1, 1); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + size_t N = last - first; + for (; first != last; first++, count++) + { + binned_dmddeposit(static_cast(*first), 1); + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template >* = nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + return; + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template >* = nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = max(max_abs_val, std::abs(input[i])); + } + add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. See the source of + ///`add()` for an example + template >* = nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; + +} // namespace hpx::parallel::detail::rfa \ No newline at end of file diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp new file mode 100644 index 000000000000..9920d3bd7a6e --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -0,0 +1,537 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/// \file parallel/algorithms/reduce_deterministic.hpp +/// \page hpx::reduce_deterministic +/// \headerfile hpx/algorithm.hpp + +#pragma once + +#if defined(DOXYGEN) + +namespace hpx { + + // clang-format off + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a FwdIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + hpx::parallel::util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns T otherwise (where T is the value_type of + /// \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename hpx::parallel::util::detail::algorithm_result::value_type + >::type + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last); + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a InIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// \returns The \a reduce algorithm returns \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// \returns The \a reduce algorithm returns a \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// \returns The \a reduce algorithm returns \a T (where T is the + /// value_type of \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename std::iterator_traits::value_type + reduce_deterministic(FwdIter first, FwdIter last); + // clang-format on +} // namespace hpx + +#else // DOXYGEN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace hpx::parallel { + + /////////////////////////////////////////////////////////////////////////// + // reduce + namespace detail { + + /// \cond NOINTERNAL + template + struct reduce_deterministic + : public algorithm, T> + { + constexpr reduce_deterministic() noexcept + : algorithm("reduce_deterministic") + { + } + + template + static constexpr T sequential(ExPolicy&& policy, InIterB first, + InIterE last, T_&& init, Reduce&& r) + { + return hpx::parallel::detail::sequential_reduce_deterministic< + ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, + HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); + } + }; + /// \endcond + } // namespace detail +} // namespace hpx::parallel + +namespace hpx { + + /////////////////////////////////////////////////////////////////////////// + // CPO for hpx::reduce + inline constexpr struct reduce_deterministic_t final + : hpx::detail::tag_parallel_algorithm + { + private: + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init, F f) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t::value_type> + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{}, + std::plus<>{}); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template + )> + // clang-format on + friend typename std::iterator_traits::value_type + tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(hpx::execution::seq, first, last, value_type{}, + std::plus<>()); + } + } reduce_deterministic{}; +} // namespace hpx + +#endif // DOXYGEN diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 7d3a9204530c..559ee830030e 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -94,6 +94,7 @@ set(tests partition_copy reduce_ reduce_by_key + reduce_deterministic remove remove1 remove2 diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp new file mode 100644 index 000000000000..92dd2e7f3dc2 --- /dev/null +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -0,0 +1,272 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils.hpp" + +int seed = std::random_device{}(); +std::mt19937 gen(seed); + +template +T get_rand( + T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +{ + return LO + + static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +template +void test_reduce1(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); + + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + // verify values + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(r1, r3); + HPX_TEST_EQ(r2, r3); +} + +template +void test_reduce_determinism(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = + get_rand(-num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic( + iterator_det(std::begin(deterministic_shuffled)), + iterator_det(std::end(deterministic_shuffled)), val_det, op); + + HPX_TEST_EQ(r1, + r1_shuffled); // Deterministically calculated, should always satisfy +} + +/// This test function is never called because it is not guaranteed to pass +/// It serves an important purpose to demonstrate that floating point summation +/// is not always associative i.e. a+b+c != a+c+b +template +void test_orig_reduce_determinism(IteratorTag) +{ + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + constexpr auto num_bounds_ndet = + std::is_same_v ? 1000.0f : 1000000.0f; + + std::vector nondeterministic(LEN); + for (size_t i = 0; i < LEN; ++i) + { + nondeterministic[i] = get_rand( + -num_bounds_ndet, num_bounds_ndet); + } + std::vector nondeterministic_shuffled = + nondeterministic; + std::shuffle(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), gen); + + FloatTypeNonDeterministic val_non_det(41.999); + + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic_shuffled)), + iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + FloatTypeNonDeterministic r3_shuffled = + std::accumulate(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), val_non_det); + + /// failed around 131 times out of 1000 on macOS arm + /// Floating point addition is not necessarily associative, + /// might fail on an architecture not yet known with much higher precision + HPX_TEST_NEQ(r2, r2_shuffled); + HPX_TEST_NEQ(r3, r3_shuffled); +} + +template +void test_reduce1() +{ + using namespace hpx::execution; + + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); +} + +template +void test_reduce2() +{ + using namespace hpx::execution; + + test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); +} + +// template +// void test_reduce3() +// { +// using namespace hpx::execution; + +// test_orig_reduce_determinism(IteratorTag()); +// test_orig_reduce_determinism(IteratorTag()); +// } + +void reduce_test1() +{ + test_reduce1(); + test_reduce2(); + // test_reduce3(); + test_reduce1(); + test_reduce2(); +} + +/////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + unsigned int seed = (unsigned int) std::time(nullptr); + bool seed_random = false; + + if (vm.count("seed")) + { + seed = vm["seed"].as(); + seed_random = true; + } + + if (vm.count("seed-random")) + seed_random = vm["seed-random"].as(); + + if (seed_random) + { + std::cout << "using seed: " << seed << std::endl; + std::cout << "** std::accumulate, hpx::reduce may fail due to " + "non-determinism of the floating summation" + << std::endl; + gen.seed(seed); + std::srand(seed); + } + else + { + gen.seed(223); + std::srand(223); + } + + reduce_test1(); + + return hpx::local::finalize(); +} + +int main(int argc, char* argv[]) +{ + // add command line option which controls the random number generator seed + using namespace hpx::program_options; + options_description desc_commandline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + + desc_commandline.add_options()("seed,s", value(), + "the random number generator seed to use for this run"); + desc_commandline.add_options()("seed-random", value(), + "switch for the random number generator seed to use for this run"); + + // By default this test should run on all available cores + std::vector const cfg = {"hpx.os_threads=all"}; + + // Initialize and run HPX + hpx::local::init_params init_args; + init_args.desc_cmdline = desc_commandline; + init_args.cfg = cfg; + + HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0, + "HPX main exited with non-zero status"); + + return hpx::util::report_errors(); +} From 8b0ce4cd5e34598840e5f28851341de4d8d01213 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 17 Dec 2024 14:22:55 -0500 Subject: [PATCH 8/9] Address inspect tool, check module cmakelists, warnings and spell check - missing includes - prevent max/min being expanded as macros - minor spell check correction - remove pragma once in cpp file - resolve implicit type conversions in rfa type to single and double and other places - add dual license - remove unnecessary command for macos ci - use HPX_UNROLL instead of vanilla pragma - clang-17 cannot unroll so use checks - add typename qualifier for iterator type Signed-off-by: Shreyas Atre --- .github/workflows/macos_debug_fetch_hwloc.yml | 1 - libs/core/algorithms/CMakeLists.txt | 3 + .../detail/reduce_deterministic.hpp | 3 + .../hpx/parallel/algorithms/detail/rfa.hpp | 159 ++++++++++++------ .../unit/algorithms/reduce_deterministic.cpp | 26 +-- 5 files changed, 133 insertions(+), 59 deletions(-) diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index f778a6b117d7..caec2af3ac25 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,7 +19,6 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ - autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt index 6fcfed897e2f..9090345722df 100644 --- a/libs/core/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/CMakeLists.txt @@ -37,7 +37,9 @@ set(algorithms_headers hpx/parallel/algorithms/detail/parallel_stable_sort.hpp hpx/parallel/algorithms/detail/pivot.hpp hpx/parallel/algorithms/detail/reduce.hpp + hpx/parallel/algorithms/detail/reduce_deterministic.hpp hpx/parallel/algorithms/detail/replace.hpp + hpx/parallel/algorithms/detail/rfa.hpp hpx/parallel/algorithms/detail/rotate.hpp hpx/parallel/algorithms/detail/sample_sort.hpp hpx/parallel/algorithms/detail/search.hpp @@ -72,6 +74,7 @@ set(algorithms_headers hpx/parallel/algorithms/partition.hpp hpx/parallel/algorithms/reduce_by_key.hpp hpx/parallel/algorithms/reduce.hpp + hpx/parallel/algorithms/reduce_deterministic.hpp hpx/parallel/algorithms/remove_copy.hpp hpx/parallel/algorithms/remove.hpp hpx/parallel/algorithms/replace.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 87069858f492..b37730889172 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -32,6 +33,8 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) { + /// TODO: Put constraint on Reduce to be a binary plus operator + (void) r; hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index 302f823fab71..fa9142cdf80b 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -1,3 +1,34 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// --------------------------------------------------------------------------- +// This file has been taken from +// https://github.com/maddyscientist/reproducible_floating_sums commit +// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added +// to this file in accordance with the dual license terms for the Reproducible +// Floating-Point Summations and conformance with the HPX policy +// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md +// --------------------------------------------------------------------------- +// +/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. //Reproducible Floating Point Accumulations via Binned Floating Point //Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. //ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. @@ -26,6 +57,10 @@ #include #include #include +#include +#include + +#include namespace hpx::parallel::detail::rfa { template @@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa { static constexpr int FOLD = FOLD_; private: - std::array data = {0}; + std::array data = {{0}}; ///Floating-point precision bin width static constexpr auto BIN_WIDTH = @@ -351,21 +386,21 @@ namespace hpx::parallel::detail::rfa { ///Get index of float-point precision ///The index of a non-binned type is the smallest index a binned type would - ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///need to have to sum it reproducibly. Higher indices correspond to smaller ///bins. static inline constexpr int binned_dindex(const ftype x) { int exp = EXP(x); if (exp == 0) { - if (x == 0.0) + if (x == static_cast(0.0)) { return MAXINDEX; } else { std::frexp(x, &exp); - return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); } } return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; @@ -373,7 +408,7 @@ namespace hpx::parallel::detail::rfa { ///Get index of manually specified binned double precision ///The index of a binned type is the bin that it corresponds to. Higher - ///indicies correspond to smaller bins. + ///indices correspond to smaller bins. inline int binned_index() const { return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - @@ -416,7 +451,9 @@ namespace hpx::parallel::detail::rfa { int shift = binned_index() - X_index; if (shift > 0) { -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -425,7 +462,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = carry((i - shift) * inccarY); } const ftype* const bins = binned_bins(X_index); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int j = 0; j < FOLD; j++) { if (j >= shift) @@ -457,16 +496,19 @@ namespace hpx::parallel::detail::rfa { if (binned_index0()) { M = primary(0); - ftype qd = x * COMPRESSION; + ftype qd = x * static_cast(COMPRESSION); auto& ql = get_bits(qd); ql |= 1; qd += M; primary(0) = qd; M -= qd; - M *= EXPANSION * 0.5; + auto temp_m = (double) (((double) EXPANSION) * 0.5); + M *= static_cast(temp_m); x += M; x += M; -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 1; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -485,7 +527,9 @@ namespace hpx::parallel::detail::rfa { { ftype qd = x; auto& ql = get_bits(qd); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -550,7 +594,7 @@ namespace hpx::parallel::detail::rfa { int i = 0; if (ISNANINF(primary(0))) - return primary(0); + return (double) primary(0); if (ISZERO(primary(0))) return 0.0; @@ -564,29 +608,36 @@ namespace hpx::parallel::detail::rfa { { scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); - scaled = std::max( - std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + scaled = (std::max)( + (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); if (X_index == 0) { - Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); - Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + Y += ((double) carry(0)) * + ((((double) bins[0]) / 6.0) * scale_down * EXPANSION); + Y += ((double) carry(inccarX)) * + ((((double) bins[1]) / 6.0) * scale_down); + Y += ((double) primary(0) - (double) bins[0]) * scale_down * + EXPANSION; i = 2; } else { - Y += carry(0) * ((bins[0] / 6.0) * scale_down); + Y += ((double) carry(0)) * + (((double) bins[0] / 6.0) * scale_down); i = 1; } for (; i < scaled; i++) { - Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); - Y += - (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + Y += ((double) carry(i * inccarX)) * + (((double) bins[i] / 6.0) * scale_down); + Y += ((double) primary((i - 1) * incpriX) - + (double) (bins[i - 1])) * + scale_down; } if (i == FOLD) { - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + Y += ((double) primary((FOLD - 1) * incpriX) - + (double) (bins[FOLD - 1])) * scale_down; return Y * scale_up; } @@ -597,20 +648,23 @@ namespace hpx::parallel::detail::rfa { Y *= scale_up; for (; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += primary((i - 1) * incpriX) - bins[i - 1]; + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + Y += ((double) primary((FOLD - 1) * incpriX) - + ((double) bins[FOLD - 1])); } else { - Y += carry(0) * (bins[0] / 6.0); + Y += ((double) carry(0)) * ((double) bins[0] / 6.0); for (i = 1; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += (primary((i - 1) * incpriX) - bins[i - 1]); + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); } return Y; } @@ -627,7 +681,7 @@ namespace hpx::parallel::detail::rfa { if (ISNANINF(primary(0))) return primary(0); if (ISZERO(primary(0))) - return 0.0; + return 0.0f; //Note that the following order of summation is in order of decreasing //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and @@ -636,20 +690,22 @@ namespace hpx::parallel::detail::rfa { const auto* const bins = binned_bins(X_index); if (X_index == 0) { - Y += (double) carry(0) * (double) (bins[0] / 6.0) * + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) * (double) EXPANSION; - Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) carry(inccarX) * + (double) (((double) bins[1]) / 6.0); Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; i = 2; } else { - Y += (double) carry(0) * (double) (bins[0] / 6.0); + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0); i = 1; } for (; i < FOLD; i++) { - Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) carry(i * inccarX) * + (double) (((double) bins[i]) / 6.0); Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); @@ -694,8 +750,10 @@ namespace hpx::parallel::detail::rfa { if (shift > 0) { const auto* const bins = binned_bins(Y_index); - //shift Y upwards and add X to Y -#pragma unroll +//shift Y upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -705,7 +763,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = x.carry(i * inccarX) + carry((i - shift) * inccarY); } -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i == shift) @@ -717,8 +777,10 @@ namespace hpx::parallel::detail::rfa { else if (shift < 0) { const auto* const bins = binned_bins(X_index); - //shift X upwards and add X to Y -#pragma unroll +//shift X upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i < -shift) @@ -731,8 +793,10 @@ namespace hpx::parallel::detail::rfa { else if (shift == 0) { const auto* const bins = binned_bins(X_index); - // add X to Y -#pragma unroll +// add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; @@ -771,7 +835,7 @@ namespace hpx::parallel::detail::rfa { } ///Return the endurance of the binned fp - constexpr int endurance() const + constexpr size_t endurance() const { return ENDURANCE; } @@ -867,11 +931,11 @@ namespace hpx::parallel::detail::rfa { { if (std::is_same_v) { - return binned_conv_single(1, 1); + return static_cast(binned_conv_single(1, 1)); } else { - return binned_conv_double(1, 1); + return static_cast(binned_conv_double(1, 1)); } } @@ -888,7 +952,8 @@ namespace hpx::parallel::detail::rfa { { const double X = std::abs(max_abs_val); const double S = std::abs(binned_sum); - return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + return static_cast( + (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) * std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + ((7.0 * EPSILON) / (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - @@ -973,7 +1038,7 @@ namespace hpx::parallel::detail::rfa { T max_abs_val = input[0]; for (size_t i = 0; i < N; i++) { - max_abs_val = max(max_abs_val, std::abs(input[i])); + max_abs_val = (std::max)(max_abs_val, std::abs(input[i])); } add(input, N, max_abs_val); } @@ -1142,4 +1207,4 @@ namespace hpx::parallel::detail::rfa { } }; -} // namespace hpx::parallel::detail::rfa \ No newline at end of file +} // namespace hpx::parallel::detail::rfa diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 92dd2e7f3dc2..5a06c509efdc 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -4,8 +4,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#pragma once - #include #include #include @@ -19,6 +17,7 @@ #include #include #include +#include #include #include "test_utils.hpp" @@ -27,11 +26,12 @@ int seed = std::random_device{}(); std::mt19937 gen(seed); template -T get_rand( - T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) { return LO + - static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); } /////////////////////////////////////////////////////////////////////////////// @@ -42,10 +42,12 @@ void test_reduce1(IteratorTag) { // check if different type for deterministic and nondeeterministic // and same result i.e. correct computation - using base_iterator_det = std::vector::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; std::vector deterministic(LEN); @@ -75,8 +77,8 @@ void test_reduce1(IteratorTag) FloatTypeNonDeterministic r3 = std::accumulate( nondeterministic.begin(), nondeterministic.end(), val_non_det); - HPX_TEST_EQ(r1, r3); - HPX_TEST_EQ(r2, r3); + HPX_TEST_EQ(static_cast(r1), r3); + HPX_TEST_EQ(static_cast(r2), r3); } template ::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; constexpr FloatTypeDeterministic num_bounds_det = @@ -129,7 +132,8 @@ template void test_orig_reduce_determinism(IteratorTag) { - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; constexpr auto num_bounds_ndet = From c205a13231f4af20fa33eae35c89852e5029c1c6 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Wed, 1 Jan 2025 21:36:17 +0530 Subject: [PATCH 9/9] Fix parallel deterministic reduce and add benchmarks Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 79 +++++++++++++++++++ .../algorithms/reduce_deterministic.hpp | 24 +++--- .../benchmark_reduce_deterministic.cpp | 21 +++-- 3 files changed, 108 insertions(+), 16 deletions(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index b37730889172..ffce8febaae3 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -65,6 +65,71 @@ namespace hpx::parallel::detail { } }; + template + struct sequential_reduce_deterministic_rfa_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_rfa_t> + { + private: + template + friend constexpr hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator + tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, + ExPolicy&&, InIterB first, std::size_t partition_size, T init, + std::true_type&&) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + size_t count = 0; + T max_val = std::abs(*first); + std::size_t partition_size_lim = 0; + for (auto e = first; partition_size_lim <= partition_size; + partition_size_lim++, e++) + { + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } + } + return rfa; + } + + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, + std::size_t partition_size, T init, std::false_type&&) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + T rfa; + rfa += init; + std::size_t partition_size_lim = 0; + for (auto e = first; partition_size_lim <= partition_size; + partition_size_lim++, e++) + { + rfa += (*e); + } + return rfa; + } + }; + #if !defined(HPX_COMPUTE_DEVICE_CODE) template inline constexpr sequential_reduce_deterministic_t @@ -80,4 +145,18 @@ namespace hpx::parallel::detail { } #endif +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_rfa_t + sequential_reduce_deterministic_rfa = + sequential_reduce_deterministic_rfa_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa( + Args&&... args) + { + return sequential_reduce_deterministic_rfa_t{}( + std::forward(args)...); + } +#endif } // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index 996865d519c5..1e59ddba735a 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -407,35 +407,37 @@ namespace hpx::parallel { ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init, Reduce&& r) { + (void)r; if (first == last) { return util::detail::algorithm_result::get( HPX_FORWARD(T_, init)); } - auto f1 = [r, policy]( - FwdIterB part_begin, std::size_t part_size) + auto f1 = [policy](FwdIterB part_begin, std::size_t part_size) -> hpx::parallel::detail::rfa:: ReproducibleFloatingAccumulator { - T val = *part_begin; + T_ val = *part_begin; return hpx::parallel::detail:: - sequential_reduce_deterministic( + sequential_reduce_deterministic_rfa( HPX_FORWARD(ExPolicy, policy), ++part_begin, - --part_size, HPX_MOVE(val), r); + --part_size, HPX_MOVE(val), + std::true_type{}); }; - return util::partitioner>::call(HPX_FORWARD(ExPolicy, policy), first, detail::distance(first, last), HPX_MOVE(f1), - hpx::unwrapping([init = HPX_FORWARD(T_, init), - r = HPX_FORWARD(Reduce, r), - policy](auto&& results) -> T { + hpx::unwrapping([policy](auto&& results) -> T_ { return hpx::parallel::detail:: - sequential_reduce_deterministic( + sequential_reduce_deterministic_rfa( HPX_FORWARD(ExPolicy, policy), hpx::util::begin(results), - hpx::util::size(results), init, r) + hpx::util::size(results), + hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator{}, + std::false_type{}) .conv(); })); } diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp index daaee2b1269b..5a267dd6a634 100644 --- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp @@ -5,6 +5,7 @@ // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #include +#include #if !defined(HPX_COMPUTE_DEVICE_CODE) #include @@ -33,15 +34,15 @@ T get_rand(T LO = (std::numeric_limits::min)(), /////////////////////////////////////////////////////////////////////////////// -void bench_reduce_deterministic( +void bench_reduce_deterministic(const auto& policy, const auto& deterministic_shuffled, const auto& val_det, const auto& op) { // check if different type for deterministic and nondeeterministic // and same result auto r1_shuffled = - hpx::reduce_deterministic((std::begin(deterministic_shuffled)), - (std::end(deterministic_shuffled)), val_det, op); + hpx::reduce_deterministic(policy, std::begin(deterministic_shuffled), + std::end(deterministic_shuffled), val_det, op); HPX_UNUSED(r1_shuffled); } @@ -61,6 +62,7 @@ int hpx_main(hpx::program_options::variables_map& vm) std::srand(seed); auto test_count = vm["test_count"].as(); + std::size_t vector_size = vm["vector-size"].as(); hpx::util::perftests_init(vm); @@ -74,7 +76,7 @@ int hpx_main(hpx::program_options::variables_map& vm) { using FloatTypeDeterministic = float; - std::size_t LEN = 10000; + std::size_t LEN = vector_size; constexpr FloatTypeDeterministic num_bounds_det = std::is_same_v ? 1000.0 : 1000000.0; @@ -113,7 +115,14 @@ int hpx_main(hpx::program_options::variables_map& vm) { hpx::util::perftests_report( "reduce deterministic", "seq", test_count, [&]() { - bench_reduce_deterministic( + bench_reduce_deterministic(hpx::execution::seq, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "reduce deterministic", "par", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::par, deterministic_shuffled, val_det, op); }); } @@ -135,6 +144,8 @@ int main(int argc, char* argv[]) cmdline.add_options() ("test_count", value()->default_value(100), "number of tests to be averaged") + ("vector-size", value()->default_value(1000000), + "number of elements to be reduced") ; // clang-format on