diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5b856872c90e..a16f9e14a814 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -105,12 +105,9 @@ jobs:
command: |
git clone --depth=1 file:///hpx/source-full source
- run:
- name: Downloading CTest XML to Junit XML
+ name: Copying CTest XML to Junit XML
command: |
- curl \
- https://raw.githubusercontent.com/Kitware/CDash/master/app/cdash/tests/circle/conv.xsl \
- --fail \
- -o /hpx/conv.xsl
+ cp /hpx/source/.circleci/conv.xsl /hpx/conv.xsl
- persist_to_workspace:
root: /hpx
paths:
diff --git a/.circleci/conv.xsl b/.circleci/conv.xsl
new file mode 100644
index 000000000000..e5b22ded6005
--- /dev/null
+++ b/.circleci/conv.xsl
@@ -0,0 +1,121 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ BuildName:
+ BuildStamp:
+ Name:
+ Generator:
+ CompilerName:
+ OSName:
+ Hostname:
+ OSRelease:
+ OSVersion:
+ OSPlatform:
+ Is64Bits:
+ VendorString:
+ VendorID:
+ FamilyID:
+ ModelID:
+ ProcessorCacheSize:
+ NumberOfLogicalCPU:
+ NumberOfPhysicalCPU:
+ TotalVirtualMemory:
+ TotalPhysicalMemory:
+ LogicalProcessorsPerPhysical:
+ ProcessorClockFrequency:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.github/workflows/windows_release_gcc_mingw.yml b/.github/workflows/windows_release_gcc_mingw.yml
index 97bb5685f0c3..b36c02773eda 100644
--- a/.github/workflows/windows_release_gcc_mingw.yml
+++ b/.github/workflows/windows_release_gcc_mingw.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 The STE||AR-Group
+# Copyright (c) 2023-2024 The STE||AR-Group
#
# SPDX-License-Identifier: BSL-1.0
# Distributed under the Boost Software License, Version 1.0. (See accompanying
@@ -22,7 +22,7 @@ jobs:
choco install ninja -y
md C:\projects
$client = new-object System.Net.WebClient
- $client.DownloadFile("https://master.dl.sourceforge.net/project/boost/boost/1.78.0/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z")
+ $client.DownloadFile("https://archives.boost.io/release/1.78.0/source/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z")
7z x C:\projects\boost_1_78_0.7z -y -oC:\projects\boost
cd C:\projects\boost\boost_1_78_0
.\bootstrap.bat gcc
diff --git a/cmake/HPX_SetupBoost.cmake b/cmake/HPX_SetupBoost.cmake
index 43360e5571db..256b56e169f8 100644
--- a/cmake/HPX_SetupBoost.cmake
+++ b/cmake/HPX_SetupBoost.cmake
@@ -117,13 +117,6 @@ if(NOT TARGET hpx_dependencies_boost)
endif()
set(__boost_libraries "")
- if(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING
- OR HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE
- )
- set(__boost_libraries ${__boost_libraries} log log_setup date_time chrono
- thread
- )
- endif()
if(HPX_WITH_GENERIC_CONTEXT_COROUTINES)
# if context is needed, we should still link with boost thread and chrono
diff --git a/cmake/toolchains/Cray.cmake b/cmake/toolchains/Cray.cmake
index 83b9c051f133..e2f369f063ff 100644
--- a/cmake/toolchains/Cray.cmake
+++ b/cmake/toolchains/Cray.cmake
@@ -70,27 +70,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED
CACHE BOOL ""
)
-set(HPX_WITH_PARCELPORT_LIBFABRIC
- ON
- CACHE BOOL ""
-)
-set(HPX_PARCELPORT_LIBFABRIC_PROVIDER
- "gni"
- CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc"
-)
-set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS
- "256"
- CACHE STRING "Max number of messages in flight at once"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE
- OFF
- CACHE BOOL "Custom libfabric logging flag"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING
- OFF
- CACHE BOOL "Libfabric parcelport logging on/off flag"
-)
-
# We do a cross compilation here ...
set(CMAKE_CROSSCOMPILING
ON
diff --git a/cmake/toolchains/CrayKNL.cmake b/cmake/toolchains/CrayKNL.cmake
index 126bcc9a0385..17d06245d37f 100644
--- a/cmake/toolchains/CrayKNL.cmake
+++ b/cmake/toolchains/CrayKNL.cmake
@@ -68,27 +68,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED
CACHE BOOL ""
)
-set(HPX_WITH_PARCELPORT_LIBFABRIC
- ON
- CACHE BOOL ""
-)
-set(HPX_PARCELPORT_LIBFABRIC_PROVIDER
- "gni"
- CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc"
-)
-set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS
- "256"
- CACHE STRING "Max number of messages in flight at once"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE
- OFF
- CACHE BOOL "Custom libfabric logging flag"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING
- OFF
- CACHE BOOL "Libfabric parcelport logging on/off flag"
-)
-
# Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the
# right hints
set(TBBMALLOC_PLATFORM
diff --git a/cmake/toolchains/CrayKNLStatic.cmake b/cmake/toolchains/CrayKNLStatic.cmake
index 97843059eaa7..76e6160ba239 100644
--- a/cmake/toolchains/CrayKNLStatic.cmake
+++ b/cmake/toolchains/CrayKNLStatic.cmake
@@ -52,27 +52,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED
CACHE BOOL ""
)
-set(HPX_WITH_PARCELPORT_LIBFABRIC
- ON
- CACHE BOOL ""
-)
-set(HPX_PARCELPORT_LIBFABRIC_PROVIDER
- "gni"
- CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc"
-)
-set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS
- "256"
- CACHE STRING "Max number of messages in flight at once"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE
- OFF
- CACHE BOOL "Custom libfabric logging flag"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING
- OFF
- CACHE BOOL "Libfabric parcelport logging on/off flag"
-)
-
# Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the
# right hints
set(TBBMALLOC_PLATFORM
diff --git a/cmake/toolchains/CrayStatic.cmake b/cmake/toolchains/CrayStatic.cmake
index f89757a2e72c..6d1bc2061085 100644
--- a/cmake/toolchains/CrayStatic.cmake
+++ b/cmake/toolchains/CrayStatic.cmake
@@ -62,24 +62,3 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED
ON
CACHE BOOL ""
)
-
-set(HPX_WITH_PARCELPORT_LIBFABRIC
- ON
- CACHE BOOL ""
-)
-set(HPX_PARCELPORT_LIBFABRIC_PROVIDER
- "gni"
- CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc"
-)
-set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS
- "256"
- CACHE STRING "Max number of messages in flight at once"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE
- OFF
- CACHE BOOL "Custom libfabric logging flag"
-)
-set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING
- OFF
- CACHE BOOL "Libfabric parcelport logging on/off flag"
-)
diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt
index 6fcfed897e2f..9090345722df 100644
--- a/libs/core/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/CMakeLists.txt
@@ -37,7 +37,9 @@ set(algorithms_headers
hpx/parallel/algorithms/detail/parallel_stable_sort.hpp
hpx/parallel/algorithms/detail/pivot.hpp
hpx/parallel/algorithms/detail/reduce.hpp
+ hpx/parallel/algorithms/detail/reduce_deterministic.hpp
hpx/parallel/algorithms/detail/replace.hpp
+ hpx/parallel/algorithms/detail/rfa.hpp
hpx/parallel/algorithms/detail/rotate.hpp
hpx/parallel/algorithms/detail/sample_sort.hpp
hpx/parallel/algorithms/detail/search.hpp
@@ -72,6 +74,7 @@ set(algorithms_headers
hpx/parallel/algorithms/partition.hpp
hpx/parallel/algorithms/reduce_by_key.hpp
hpx/parallel/algorithms/reduce.hpp
+ hpx/parallel/algorithms/reduce_deterministic.hpp
hpx/parallel/algorithms/remove_copy.hpp
hpx/parallel/algorithms/remove.hpp
hpx/parallel/algorithms/replace.hpp
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index b2de030eed8b..b37730889172 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -11,9 +11,9 @@
#include
#include
#include
-#include
#include
+#include
#include
#include
#include
@@ -33,9 +33,11 @@ namespace hpx::parallel::detail {
sequential_reduce_deterministic_t, ExPolicy&&, InIterB first,
InIterE last, T init, Reduce&& r)
{
+ /// TODO: Put constraint on Reduce to be a binary plus operator
+ (void) r;
hpx::parallel::detail::rfa::RFA_bins bins;
bins.initialize_bins();
- std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+ std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa;
rfa.set_max_abs_val(init);
@@ -63,84 +65,6 @@ namespace hpx::parallel::detail {
}
};
- template
- struct sequential_reduce_deterministic_rfa_t final
- : hpx::functional::detail::tag_fallback<
- sequential_reduce_deterministic_rfa_t>
- {
- private:
- template
- friend constexpr hpx::parallel::detail::rfa::
- ReproducibleFloatingAccumulator
- tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
- ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r)
- {
- hpx::parallel::detail::rfa::RFA_bins bins;
- bins.initialize_bins();
- std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
-
- hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa;
-
- for (auto e = first; e != last; ++e)
- {
- rfa += *e;
- }
- return rfa;
- }
-
- template
- friend constexpr hpx::parallel::detail::rfa::
- ReproducibleFloatingAccumulator
- tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
- ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r)
- {
- hpx::parallel::detail::rfa::RFA_bins bins;
- bins.initialize_bins();
- std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
-
- hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa;
- auto e = first;
- for (std::size_t i = 0; i < size; ++i, ++e)
- {
- rfa += *e;
- }
- return rfa;
- }
-
- // template ,
- // // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
- // // double>>::value>
- // >
- // friend constexpr T tag_fallback_invoke(
- // sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
- // InIterE last, T init, Reduce&& r)
- // {
- // static_assert(hpx::util::contains,
- // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
- // double>>::value);
- // hpx::parallel::detail::rfa::RFA_bins bins;
- // bins.initialize_bins();
- // std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
-
- // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa;
- // rfa.set_max_abs_val(init);
- // rfa.unsafe_add(init);
- // rfa.renorm();
- // for (auto e = first; e != last; ++e)
- // {
- // rfa += *e;
- // }
- // return rfa.conv();
- // }
- };
-
#if !defined(HPX_COMPUTE_DEVICE_CODE)
template
inline constexpr sequential_reduce_deterministic_t
@@ -156,18 +80,4 @@ namespace hpx::parallel::detail {
}
#endif
-#if !defined(HPX_COMPUTE_DEVICE_CODE)
- template
- inline constexpr sequential_reduce_deterministic_rfa_t
- sequential_reduce_deterministic_rfa =
- sequential_reduce_deterministic_rfa_t{};
-#else
- template
- HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa(
- Args&&... args)
- {
- return sequential_reduce_deterministic_rfa_t{}(
- std::forward(args)...);
- }
-#endif
} // namespace hpx::parallel::detail
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index 4a9910cb6faa..fa9142cdf80b 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -1,3 +1,34 @@
+// Copyright (c) 2024 Shreyas Atre
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// ---------------------------------------------------------------------------
+// This file has been taken from
+// https://github.com/maddyscientist/reproducible_floating_sums commit
+// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added
+// to this file in accordance with the dual license terms for the Reproducible
+// Floating-Point Summations and conformance with the HPX policy
+// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md
+// ---------------------------------------------------------------------------
+//
+/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+/// The above copyright notice and this permission notice shall be included in
+/// all copies or substantial portions of the Software.
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
//Reproducible Floating Point Accumulations via Binned Floating Point
//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
@@ -26,6 +57,10 @@
#include
#include
#include
+#include
+#include
+
+#include
namespace hpx::parallel::detail::rfa {
template
@@ -163,7 +198,7 @@ namespace hpx::parallel::detail::rfa {
}
};
- static char bin_host_buffer[sizeof(RFA_bins)];
+ static char __rfa_bin_host_buffer__[sizeof(RFA_bins)];
///Class to hold a reproducible summation of the numbers passed to it
///
@@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa {
static constexpr int FOLD = FOLD_;
private:
- std::array data = {0};
+ std::array data = {{0}};
///Floating-point precision bin width
static constexpr auto BIN_WIDTH =
@@ -214,7 +249,8 @@ namespace hpx::parallel::detail::rfa {
///Return a binned floating-point reference bin
inline const ftype* binned_bins(const int x) const
{
- return &reinterpret_cast&>(bin_host_buffer)[x];
+ return &reinterpret_cast&>(
+ __rfa_bin_host_buffer__)[x];
}
///Get the bit representation of a float
@@ -350,21 +386,21 @@ namespace hpx::parallel::detail::rfa {
///Get index of float-point precision
///The index of a non-binned type is the smallest index a binned type would
- ///need to have to sum it reproducibly. Higher indicies correspond to smaller
+ ///need to have to sum it reproducibly. Higher indices correspond to smaller
///bins.
static inline constexpr int binned_dindex(const ftype x)
{
int exp = EXP(x);
if (exp == 0)
{
- if (x == 0.0)
+ if (x == static_cast(0.0))
{
return MAXINDEX;
}
else
{
std::frexp(x, &exp);
- return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+ return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
}
}
return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
@@ -372,7 +408,7 @@ namespace hpx::parallel::detail::rfa {
///Get index of manually specified binned double precision
///The index of a binned type is the bin that it corresponds to. Higher
- ///indicies correspond to smaller bins.
+ ///indices correspond to smaller bins.
inline int binned_index() const
{
return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
@@ -415,7 +451,9 @@ namespace hpx::parallel::detail::rfa {
int shift = binned_index() - X_index;
if (shift > 0)
{
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int i = FOLD - 1; i >= 1; i--)
{
if (i < shift)
@@ -424,7 +462,9 @@ namespace hpx::parallel::detail::rfa {
carry(i * inccarY) = carry((i - shift) * inccarY);
}
const ftype* const bins = binned_bins(X_index);
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int j = 0; j < FOLD; j++)
{
if (j >= shift)
@@ -456,16 +496,19 @@ namespace hpx::parallel::detail::rfa {
if (binned_index0())
{
M = primary(0);
- ftype qd = x * COMPRESSION;
+ ftype qd = x * static_cast(COMPRESSION);
auto& ql = get_bits(qd);
ql |= 1;
qd += M;
primary(0) = qd;
M -= qd;
- M *= EXPANSION * 0.5;
+ auto temp_m = (double) (((double) EXPANSION) * 0.5);
+ M *= static_cast(temp_m);
x += M;
x += M;
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int i = 1; i < FOLD - 1; i++)
{
M = primary(i * incpriY);
@@ -484,7 +527,9 @@ namespace hpx::parallel::detail::rfa {
{
ftype qd = x;
auto& ql = get_bits(qd);
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int i = 0; i < FOLD - 1; i++)
{
M = primary(i * incpriY);
@@ -549,7 +594,7 @@ namespace hpx::parallel::detail::rfa {
int i = 0;
if (ISNANINF(primary(0)))
- return primary(0);
+ return (double) primary(0);
if (ISZERO(primary(0)))
return 0.0;
@@ -563,29 +608,36 @@ namespace hpx::parallel::detail::rfa {
{
scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
- scaled = std::max(
- std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
+ scaled = (std::max)(
+ (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
if (X_index == 0)
{
- Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
- Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
- Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
+ Y += ((double) carry(0)) *
+ ((((double) bins[0]) / 6.0) * scale_down * EXPANSION);
+ Y += ((double) carry(inccarX)) *
+ ((((double) bins[1]) / 6.0) * scale_down);
+ Y += ((double) primary(0) - (double) bins[0]) * scale_down *
+ EXPANSION;
i = 2;
}
else
{
- Y += carry(0) * ((bins[0] / 6.0) * scale_down);
+ Y += ((double) carry(0)) *
+ (((double) bins[0] / 6.0) * scale_down);
i = 1;
}
for (; i < scaled; i++)
{
- Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
- Y +=
- (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
+ Y += ((double) carry(i * inccarX)) *
+ (((double) bins[i] / 6.0) * scale_down);
+ Y += ((double) primary((i - 1) * incpriX) -
+ (double) (bins[i - 1])) *
+ scale_down;
}
if (i == FOLD)
{
- Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
+ Y += ((double) primary((FOLD - 1) * incpriX) -
+ (double) (bins[FOLD - 1])) *
scale_down;
return Y * scale_up;
}
@@ -596,20 +648,23 @@ namespace hpx::parallel::detail::rfa {
Y *= scale_up;
for (; i < FOLD; i++)
{
- Y += carry(i * inccarX) * (bins[i] / 6.0);
- Y += primary((i - 1) * incpriX) - bins[i - 1];
+ Y += ((double) carry(i * inccarX)) *
+ ((double) bins[i] / 6.0);
+ Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
}
- Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
+ Y += ((double) primary((FOLD - 1) * incpriX) -
+ ((double) bins[FOLD - 1]));
}
else
{
- Y += carry(0) * (bins[0] / 6.0);
+ Y += ((double) carry(0)) * ((double) bins[0] / 6.0);
for (i = 1; i < FOLD; i++)
{
- Y += carry(i * inccarX) * (bins[i] / 6.0);
- Y += (primary((i - 1) * incpriX) - bins[i - 1]);
+ Y += ((double) carry(i * inccarX)) *
+ ((double) bins[i] / 6.0);
+ Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
}
- Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+ Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
}
return Y;
}
@@ -626,7 +681,7 @@ namespace hpx::parallel::detail::rfa {
if (ISNANINF(primary(0)))
return primary(0);
if (ISZERO(primary(0)))
- return 0.0;
+ return 0.0f;
//Note that the following order of summation is in order of decreasing
//exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
@@ -635,20 +690,22 @@ namespace hpx::parallel::detail::rfa {
const auto* const bins = binned_bins(X_index);
if (X_index == 0)
{
- Y += (double) carry(0) * (double) (bins[0] / 6.0) *
+ Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) *
(double) EXPANSION;
- Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+ Y += (double) carry(inccarX) *
+ (double) (((double) bins[1]) / 6.0);
Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
i = 2;
}
else
{
- Y += (double) carry(0) * (double) (bins[0] / 6.0);
+ Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0);
i = 1;
}
for (; i < FOLD; i++)
{
- Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+ Y += (double) carry(i * inccarX) *
+ (double) (((double) bins[i]) / 6.0);
Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
}
Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
@@ -693,8 +750,10 @@ namespace hpx::parallel::detail::rfa {
if (shift > 0)
{
const auto* const bins = binned_bins(Y_index);
- //shift Y upwards and add X to Y
-#pragma unroll
+//shift Y upwards and add X to Y
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int i = FOLD - 1; i >= 1; i--)
{
if (i < shift)
@@ -704,7 +763,9 @@ namespace hpx::parallel::detail::rfa {
carry(i * inccarY) =
x.carry(i * inccarX) + carry((i - shift) * inccarY);
}
-#pragma unroll
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int i = 0; i < FOLD; i++)
{
if (i == shift)
@@ -716,8 +777,10 @@ namespace hpx::parallel::detail::rfa {
else if (shift < 0)
{
const auto* const bins = binned_bins(X_index);
- //shift X upwards and add X to Y
-#pragma unroll
+//shift X upwards and add X to Y
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int i = 0; i < FOLD; i++)
{
if (i < -shift)
@@ -730,8 +793,10 @@ namespace hpx::parallel::detail::rfa {
else if (shift == 0)
{
const auto* const bins = binned_bins(X_index);
- // add X to Y
-#pragma unroll
+// add X to Y
+#if !defined(HPX_CLANG_VERSION)
+ HPX_UNROLL
+#endif
for (int i = 0; i < FOLD; i++)
{
primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
@@ -770,7 +835,7 @@ namespace hpx::parallel::detail::rfa {
}
///Return the endurance of the binned fp
- constexpr int endurance() const
+ constexpr size_t endurance() const
{
return ENDURANCE;
}
@@ -866,11 +931,11 @@ namespace hpx::parallel::detail::rfa {
{
if (std::is_same_v)
{
- return binned_conv_single(1, 1);
+ return static_cast(binned_conv_single(1, 1));
}
else
{
- return binned_conv_double(1, 1);
+ return static_cast(binned_conv_double(1, 1));
}
}
@@ -887,7 +952,8 @@ namespace hpx::parallel::detail::rfa {
{
const double X = std::abs(max_abs_val);
const double S = std::abs(binned_sum);
- return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) *
+ return static_cast(
+ (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) *
std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
((7.0 * EPSILON) /
(1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) -
@@ -972,7 +1038,7 @@ namespace hpx::parallel::detail::rfa {
T max_abs_val = input[0];
for (size_t i = 0; i < N; i++)
{
- max_abs_val = max(max_abs_val, std::abs(input[i]));
+ max_abs_val = (std::max)(max_abs_val, std::abs(input[i]));
}
add(input, N, max_abs_val);
}
@@ -1141,4 +1207,4 @@ namespace hpx::parallel::detail::rfa {
}
};
-} // namespace hpx::parallel::detail::rfa
\ No newline at end of file
+} // namespace hpx::parallel::detail::rfa
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index b8435eec9a02..996865d519c5 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -10,7 +10,6 @@
#pragma once
-#include "detail/reduce_deterministic.hpp"
#if defined(DOXYGEN)
namespace hpx {
@@ -420,7 +419,7 @@ namespace hpx::parallel {
ReproducibleFloatingAccumulator {
T val = *part_begin;
return hpx::parallel::detail::
- sequential_reduce_deterministic_rfa(
+ sequential_reduce_deterministic(
HPX_FORWARD(ExPolicy, policy), ++part_begin,
--part_size, HPX_MOVE(val), r);
};
@@ -433,7 +432,7 @@ namespace hpx::parallel {
r = HPX_FORWARD(Reduce, r),
policy](auto&& results) -> T {
return hpx::parallel::detail::
- sequential_reduce_deterministic_rfa(
+ sequential_reduce_deterministic(
HPX_FORWARD(ExPolicy, policy),
hpx::util::begin(results),
hpx::util::size(results), init, r)
diff --git a/libs/core/algorithms/tests/performance/CMakeLists.txt b/libs/core/algorithms/tests/performance/CMakeLists.txt
index d74788a9b47f..96ce826dc742 100644
--- a/libs/core/algorithms/tests/performance/CMakeLists.txt
+++ b/libs/core/algorithms/tests/performance/CMakeLists.txt
@@ -16,6 +16,7 @@ set(benchmarks
benchmark_partial_sort_parallel
benchmark_partition
benchmark_partition_copy
+ benchmark_reduce_deterministic
benchmark_remove
benchmark_remove_if
benchmark_scan_algorithms
diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
new file mode 100644
index 000000000000..daaee2b1269b
--- /dev/null
+++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
@@ -0,0 +1,148 @@
+// Copyright (c) 2024 Shreyas Atre
+//
+// SPDX-License-Identifier: BSL-1.0
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include
+
+#if !defined(HPX_COMPUTE_DEVICE_CODE)
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+int seed = 1000;
+std::mt19937 gen(seed);
+// Maps std::rand() linearly from [0, RAND_MAX] onto roughly [LO, HI].
+template
+T get_rand(T LO = (std::numeric_limits::min)(),
+ T HI = (std::numeric_limits::max)())
+{
+ return LO +
+ static_cast(std::rand()) /
+ (static_cast(static_cast((RAND_MAX)) / (HI - LO)));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void bench_reduce_deterministic(
+ const auto& deterministic_shuffled, const auto& val_det, const auto& op)
+{
+ // run a single hpx::reduce_deterministic pass (no execution policy is
+ // passed) over the whole range, starting from val_det
+
+ auto r1_shuffled =
+ hpx::reduce_deterministic((std::begin(deterministic_shuffled)),
+ (std::end(deterministic_shuffled)), val_det, op);
+ // the sum is not inspected here; it is only handed to HPX_UNUSED
+ HPX_UNUSED(r1_shuffled);
+}
+
+void bench_reduce(const auto& policy, const auto& deterministic_shuffled,
+    const auto& val_det, const auto& op)
+{
+    // Baseline: a single hpx::reduce pass under `policy`; the sum is unused.
+    auto const sum = hpx::reduce(policy, std::begin(deterministic_shuffled),
+        std::end(deterministic_shuffled), val_det, op);
+    HPX_UNUSED(sum);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+int hpx_main(hpx::program_options::variables_map& vm)
+{
+    // Times hpx::reduce (seq and par) and hpx::reduce_deterministic on the
+    // same shuffled float vector; test_count is handed to every report call.
+    std::srand(seed);
+    auto test_count = vm["test_count"].as();
+    hpx::util::perftests_init(vm);
+
+    // reject non-positive repetition counts before running any benchmark
+    if (test_count <= 0)
+    {
+        std::cerr << "test_count cannot be zero or negative...\n" << std::flush;
+        hpx::local::finalize();
+        return -1;
+    }
+
+    {
+        using FloatTypeDeterministic = float;
+        constexpr std::size_t LEN = 10000;
+        // inputs are drawn from [-num_bounds_det, num_bounds_det]
+        constexpr FloatTypeDeterministic num_bounds_det =
+            std::is_same_v ? 1000.0 : 1000000.0;
+
+        std::vector deterministic(LEN);
+
+        for (std::size_t i = 0; i < LEN; ++i)
+        {
+            deterministic[i] = get_rand(
+                -num_bounds_det, num_bounds_det);
+        }
+
+        std::vector deterministic_shuffled =
+            deterministic;
+        // reorder the copy with the fixed-seed engine `gen`
+        std::shuffle(
+            deterministic_shuffled.begin(), deterministic_shuffled.end(), gen);
+
+        FloatTypeDeterministic val_det(41.999);
+
+        auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) {
+            return v1 + v2;
+        };
+        {
+            hpx::util::perftests_report("reduce", "seq", test_count, [&]() {
+                bench_reduce(
+                    hpx::execution::seq, deterministic_shuffled, val_det, op);
+            });
+        }
+        {
+            hpx::util::perftests_report("reduce", "par", test_count, [&]() {
+                bench_reduce(
+                    hpx::execution::par, deterministic_shuffled, val_det, op);
+            });
+        }
+        {
+            hpx::util::perftests_report(
+                "reduce deterministic", "seq", test_count, [&]() {
+                    bench_reduce_deterministic(
+                        deterministic_shuffled, val_det, op);
+                });
+        }
+
+        hpx::util::perftests_print_times();
+    }
+
+    return hpx::local::finalize();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[])
+{
+ using namespace hpx::program_options;
+
+ options_description cmdline("usage: " HPX_APPLICATION_STRING " [options]");
+ // benchmark-specific option: test_count, default 100
+ // clang-format off
+ cmdline.add_options()
+ ("test_count", value()->default_value(100),
+ "number of tests to be averaged")
+ ;
+ // clang-format on
+ // perftests_cfg receives cmdline (presumably to register its own options)
+ hpx::util::perftests_cfg(cmdline);
+ hpx::local::init_params init_args;
+ init_args.desc_cmdline = cmdline;
+ init_args.cfg = {"hpx.os_threads=all"};
+ // hand hpx_main and the assembled parameters to hpx::local::init
+ return hpx::local::init(hpx_main, argc, argv, init_args);
+}
+#endif
diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
index 76dc5fcd9806..559ee830030e 100644
--- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
@@ -246,7 +246,3 @@ foreach(test ${tests})
"modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS}
)
endforeach()
-
-target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address)
-
-target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address)
\ No newline at end of file
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 694b50cfb76d..5a06c509efdc 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -4,8 +4,6 @@
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-#pragma once
-
#include
#include
#include
@@ -19,6 +17,7 @@
#include
#include
#include
+#include
#include
#include "test_utils.hpp"
@@ -27,11 +26,12 @@ int seed = std::random_device{}();
std::mt19937 gen(seed);
template
-T get_rand(
- T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max())
+T get_rand(T LO = (std::numeric_limits::min)(),
+ T HI = (std::numeric_limits::max)())
{
return LO +
- static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO)));
+ static_cast(std::rand()) /
+ (static_cast(static_cast((RAND_MAX)) / (HI - LO)));
}
///////////////////////////////////////////////////////////////////////////////
@@ -42,10 +42,12 @@ void test_reduce1(IteratorTag)
{
// check if different type for deterministic and nondeeterministic
// and same result i.e. correct computation
- using base_iterator_det = std::vector::iterator;
+ using base_iterator_det =
+ typename std::vector::iterator;
using iterator_det = test::test_iterator;
- using base_iterator_ndet = std::vector::iterator;
+ using base_iterator_ndet =
+ typename std::vector::iterator;
using iterator_ndet = test::test_iterator;
std::vector deterministic(LEN);
@@ -75,51 +77,8 @@ void test_reduce1(IteratorTag)
FloatTypeNonDeterministic r3 = std::accumulate(
nondeterministic.begin(), nondeterministic.end(), val_non_det);
- HPX_TEST_EQ(r1, r3);
- HPX_TEST_EQ(r2, r3);
-}
-
-template
-void test_reduce_parallel1(IteratorTag)
-{
- // check if different type for deterministic and nondeeterministic
- // and same result i.e. correct computation
- using base_iterator_det = std::vector::iterator;
- using iterator_det = test::test_iterator;
-
- using base_iterator_ndet = std::vector::iterator;
- using iterator_ndet = test::test_iterator;
-
- std::vector deterministic(LEN);
- std::vector nondeterministic(LEN);
-
- std::iota(
- deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0));
-
- std::iota(nondeterministic.begin(), nondeterministic.end(),
- FloatTypeNonDeterministic(0));
-
- FloatTypeDeterministic val_det(0);
- FloatTypeNonDeterministic val_non_det(0);
- auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) {
- return v1 + v2;
- };
-
- FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par,
- iterator_det(std::begin(deterministic)),
- iterator_det(std::end(deterministic)), val_det, op);
-
- // verify values
- // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par,
- // iterator_ndet(std::begin(nondeterministic)),
- // iterator_ndet(std::end(nondeterministic)), val_non_det, op);
-
- FloatTypeNonDeterministic r3 = std::accumulate(
- nondeterministic.begin(), nondeterministic.end(), val_non_det);
-
- HPX_TEST_EQ(r1, r3);
- // HPX_TEST_EQ(r2, r3);
+ HPX_TEST_EQ(static_cast(r1), r3);
+ HPX_TEST_EQ(static_cast(r2), r3);
}
template ::iterator;
+ using base_iterator_det =
+ typename std::vector::iterator;
using iterator_det = test::test_iterator;
- constexpr auto num_bounds_det =
- std::is_same_v ? 1000.0f : 1000000.0f;
+ constexpr FloatTypeDeterministic num_bounds_det =
+ std::is_same_v ? 1000.0 : 1000000.0;
std::vector deterministic(LEN);
@@ -165,11 +125,15 @@ void test_reduce_determinism(IteratorTag)
r1_shuffled); // Deterministically calculated, should always satisfy
}
+/// This test function is never called because it is not guaranteed to pass.
+/// It is kept to demonstrate that floating point summation is not
+/// associative, i.e. (a + b) + c may differ from a + (b + c).
template
void test_orig_reduce_determinism(IteratorTag)
{
- using base_iterator_ndet = std::vector::iterator;
+ using base_iterator_ndet =
+ typename std::vector::iterator;
using iterator_ndet = test::test_iterator;
constexpr auto num_bounds_ndet =
@@ -221,7 +185,6 @@ void test_reduce1()
test_reduce1(IteratorTag());
test_reduce1(IteratorTag());
test_reduce1(IteratorTag());
- test_reduce_parallel1(IteratorTag());
}
template
@@ -233,21 +196,22 @@ void test_reduce2()
test_reduce_determinism(IteratorTag());
}
-template
-void test_reduce3()
-{
- using namespace hpx::execution;
+// template
+// void test_reduce3()
+// {
+// using namespace hpx::execution;
- test_orig_reduce_determinism(IteratorTag());
- test_orig_reduce_determinism(IteratorTag());
-}
+// test_orig_reduce_determinism(IteratorTag());
+// test_orig_reduce_determinism(IteratorTag());
+// }
void reduce_test1()
{
test_reduce1();
test_reduce2();
- test_reduce3();
- // test_reduce1();
+ // test_reduce3();
+ test_reduce1();
+ test_reduce2();
}
///////////////////////////////////////////////////////////////////////////////
diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
index d86fc5775415..b29652a3ede1 100644
--- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp
+++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
@@ -25,7 +25,7 @@ void tagged_ptr_test()
i = j;
HPX_TEST_EQ(i.get_ptr(), &b);
- HPX_TEST_EQ(i.get_tag(), 1UL);
+ HPX_TEST_EQ(i.get_tag(), 1);
}
{
@@ -43,7 +43,7 @@ void tagged_ptr_test()
{
tagged_ptr j(&a, max_tag);
- HPX_TEST_EQ(j.get_next_tag(), 0UL);
+ HPX_TEST_EQ(j.get_next_tag(), 0);
}
{
diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp
index 8a01d9574853..3d7cf5da2aa0 100644
--- a/libs/core/debugging/src/print.cpp
+++ b/libs/core/debugging/src/print.cpp
@@ -57,10 +57,6 @@ namespace hpx::debug {
std::ostream&, std::int32_t const&, int);
template HPX_CORE_EXPORT void print_dec(
std::ostream&, std::int64_t const&, int);
-#ifdef __APPLE__
- template HPX_CORE_EXPORT void print_dec(
- std::ostream&, unsigned long const&, int);
-#endif
template HPX_CORE_EXPORT void print_dec(
std::ostream&, std::uint64_t const&, int);