diff --git a/.circleci/config.yml b/.circleci/config.yml index 5b856872c90e..a16f9e14a814 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -105,12 +105,9 @@ jobs: command: | git clone --depth=1 file:///hpx/source-full source - run: - name: Downloading CTest XML to Junit XML + name: Copying CTest XML to Junit XML command: | - curl \ - https://raw.githubusercontent.com/Kitware/CDash/master/app/cdash/tests/circle/conv.xsl \ - --fail \ - -o /hpx/conv.xsl + cp /hpx/source/.circleci/conv.xsl /hpx/conv.xsl - persist_to_workspace: root: /hpx paths: diff --git a/.circleci/conv.xsl b/.circleci/conv.xsl new file mode 100644 index 000000000000..e5b22ded6005 --- /dev/null +++ b/.circleci/conv.xsl @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BuildName: + BuildStamp: + Name: + Generator: + CompilerName: + OSName: + Hostname: + OSRelease: + OSVersion: + OSPlatform: + Is64Bits: + VendorString: + VendorID: + FamilyID: + ModelID: + ProcessorCacheSize: + NumberOfLogicalCPU: + NumberOfPhysicalCPU: + TotalVirtualMemory: + TotalPhysicalMemory: + LogicalProcessorsPerPhysical: + ProcessorClockFrequency: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.github/workflows/windows_release_gcc_mingw.yml b/.github/workflows/windows_release_gcc_mingw.yml index 97bb5685f0c3..b36c02773eda 100644 --- a/.github/workflows/windows_release_gcc_mingw.yml +++ b/.github/workflows/windows_release_gcc_mingw.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2023 The STE||AR-Group +# Copyright (c) 2023-2024 The STE||AR-Group # # SPDX-License-Identifier: BSL-1.0 # Distributed under the Boost Software License, Version 1.0. 
(See accompanying @@ -22,7 +22,7 @@ jobs: choco install ninja -y md C:\projects $client = new-object System.Net.WebClient - $client.DownloadFile("https://master.dl.sourceforge.net/project/boost/boost/1.78.0/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z") + $client.DownloadFile("https://archives.boost.io/release/1.78.0/source/boost_1_78_0.7z","C:\projects\boost_1_78_0.7z") 7z x C:\projects\boost_1_78_0.7z -y -oC:\projects\boost cd C:\projects\boost\boost_1_78_0 .\bootstrap.bat gcc diff --git a/cmake/HPX_SetupBoost.cmake b/cmake/HPX_SetupBoost.cmake index 43360e5571db..256b56e169f8 100644 --- a/cmake/HPX_SetupBoost.cmake +++ b/cmake/HPX_SetupBoost.cmake @@ -117,13 +117,6 @@ if(NOT TARGET hpx_dependencies_boost) endif() set(__boost_libraries "") - if(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OR HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - ) - set(__boost_libraries ${__boost_libraries} log log_setup date_time chrono - thread - ) - endif() if(HPX_WITH_GENERIC_CONTEXT_COROUTINES) # if context is needed, we should still link with boost thread and chrono diff --git a/cmake/toolchains/Cray.cmake b/cmake/toolchains/Cray.cmake index 83b9c051f133..e2f369f063ff 100644 --- a/cmake/toolchains/Cray.cmake +++ b/cmake/toolchains/Cray.cmake @@ -70,27 +70,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # We do a cross compilation here ... 
set(CMAKE_CROSSCOMPILING ON diff --git a/cmake/toolchains/CrayKNL.cmake b/cmake/toolchains/CrayKNL.cmake index 126bcc9a0385..17d06245d37f 100644 --- a/cmake/toolchains/CrayKNL.cmake +++ b/cmake/toolchains/CrayKNL.cmake @@ -68,27 +68,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the # right hints set(TBBMALLOC_PLATFORM diff --git a/cmake/toolchains/CrayKNLStatic.cmake b/cmake/toolchains/CrayKNLStatic.cmake index 97843059eaa7..76e6160ba239 100644 --- a/cmake/toolchains/CrayKNLStatic.cmake +++ b/cmake/toolchains/CrayKNLStatic.cmake @@ -52,27 +52,6 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED CACHE BOOL "" ) -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) - # Set the TBBMALLOC_PLATFORM correctly so that find_package(TBBMalloc) sets the # right hints set(TBBMALLOC_PLATFORM diff --git a/cmake/toolchains/CrayStatic.cmake b/cmake/toolchains/CrayStatic.cmake index f89757a2e72c..6d1bc2061085 100644 --- a/cmake/toolchains/CrayStatic.cmake +++ 
b/cmake/toolchains/CrayStatic.cmake @@ -62,24 +62,3 @@ set(HPX_WITH_PARCELPORT_MPI_MULTITHREADED ON CACHE BOOL "" ) - -set(HPX_WITH_PARCELPORT_LIBFABRIC - ON - CACHE BOOL "" -) -set(HPX_PARCELPORT_LIBFABRIC_PROVIDER - "gni" - CACHE STRING "See libfabric docs for details, gni,verbs,psm2 etc etc" -) -set(HPX_PARCELPORT_LIBFABRIC_THROTTLE_SENDS - "256" - CACHE STRING "Max number of messages in flight at once" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_DEV_MODE - OFF - CACHE BOOL "Custom libfabric logging flag" -) -set(HPX_PARCELPORT_LIBFABRIC_WITH_LOGGING - OFF - CACHE BOOL "Libfabric parcelport logging on/off flag" -) diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt index 6fcfed897e2f..9090345722df 100644 --- a/libs/core/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/CMakeLists.txt @@ -37,7 +37,9 @@ set(algorithms_headers hpx/parallel/algorithms/detail/parallel_stable_sort.hpp hpx/parallel/algorithms/detail/pivot.hpp hpx/parallel/algorithms/detail/reduce.hpp + hpx/parallel/algorithms/detail/reduce_deterministic.hpp hpx/parallel/algorithms/detail/replace.hpp + hpx/parallel/algorithms/detail/rfa.hpp hpx/parallel/algorithms/detail/rotate.hpp hpx/parallel/algorithms/detail/sample_sort.hpp hpx/parallel/algorithms/detail/search.hpp @@ -72,6 +74,7 @@ set(algorithms_headers hpx/parallel/algorithms/partition.hpp hpx/parallel/algorithms/reduce_by_key.hpp hpx/parallel/algorithms/reduce.hpp + hpx/parallel/algorithms/reduce_deterministic.hpp hpx/parallel/algorithms/remove_copy.hpp hpx/parallel/algorithms/remove.hpp hpx/parallel/algorithms/replace.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index b2de030eed8b..b37730889172 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ 
b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -11,9 +11,9 @@ #include #include #include -#include #include +#include #include #include #include @@ -33,9 +33,11 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) { + /// TODO: Put constraint on Reduce to be a binary plus operator + (void) r; hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); - std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; rfa.set_max_abs_val(init); @@ -63,84 +65,6 @@ namespace hpx::parallel::detail { } }; - template - struct sequential_reduce_deterministic_rfa_t final - : hpx::functional::detail::tag_fallback< - sequential_reduce_deterministic_rfa_t> - { - private: - template - friend constexpr hpx::parallel::detail::rfa:: - ReproducibleFloatingAccumulator - tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, - ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) - { - hpx::parallel::detail::rfa::RFA_bins bins; - bins.initialize_bins(); - std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); - - hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; - - for (auto e = first; e != last; ++e) - { - rfa += *e; - } - return rfa; - } - - template - friend constexpr hpx::parallel::detail::rfa:: - ReproducibleFloatingAccumulator - tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, - ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r) - { - hpx::parallel::detail::rfa::RFA_bins bins; - bins.initialize_bins(); - std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); - - hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; - auto e = first; - for (std::size_t i = 0; i < size; ++i, ++e) - { - rfa += *e; - } - return rfa; - } - - // template , - // // 
hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< - // // double>>::value> - // > - // friend constexpr T tag_fallback_invoke( - // sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, - // InIterE last, T init, Reduce&& r) - // { - // static_assert(hpx::util::contains, - // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< - // double>>::value); - // hpx::parallel::detail::rfa::RFA_bins bins; - // bins.initialize_bins(); - // std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); - - // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; - // rfa.set_max_abs_val(init); - // rfa.unsafe_add(init); - // rfa.renorm(); - // for (auto e = first; e != last; ++e) - // { - // rfa += *e; - // } - // return rfa.conv(); - // } - }; - #if !defined(HPX_COMPUTE_DEVICE_CODE) template inline constexpr sequential_reduce_deterministic_t @@ -156,18 +80,4 @@ namespace hpx::parallel::detail { } #endif -#if !defined(HPX_COMPUTE_DEVICE_CODE) - template - inline constexpr sequential_reduce_deterministic_rfa_t - sequential_reduce_deterministic_rfa = - sequential_reduce_deterministic_rfa_t{}; -#else - template - HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa( - Args&&... args) - { - return sequential_reduce_deterministic_rfa_t{}( - std::forward(args)...); - } -#endif } // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index 4a9910cb6faa..fa9142cdf80b 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -1,3 +1,34 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// --------------------------------------------------------------------------- +// This file has been taken from +// https://github.com/maddyscientist/reproducible_floating_sums commit +// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added +// to this file in accordance with the dual license terms for the Reproducible +// Floating-Point Summations and conformance with the HPX policy +// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md +// --------------------------------------------------------------------------- +// +/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. //Reproducible Floating Point Accumulations via Binned Floating Point //Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. 
//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. @@ -26,6 +57,10 @@ #include #include #include +#include +#include + +#include namespace hpx::parallel::detail::rfa { template @@ -163,7 +198,7 @@ namespace hpx::parallel::detail::rfa { } }; - static char bin_host_buffer[sizeof(RFA_bins)]; + static char __rfa_bin_host_buffer__[sizeof(RFA_bins)]; ///Class to hold a reproducible summation of the numbers passed to it /// @@ -179,7 +214,7 @@ namespace hpx::parallel::detail::rfa { static constexpr int FOLD = FOLD_; private: - std::array data = {0}; + std::array data = {{0}}; ///Floating-point precision bin width static constexpr auto BIN_WIDTH = @@ -214,7 +249,8 @@ namespace hpx::parallel::detail::rfa { ///Return a binned floating-point reference bin inline const ftype* binned_bins(const int x) const { - return &reinterpret_cast&>(bin_host_buffer)[x]; + return &reinterpret_cast&>( + __rfa_bin_host_buffer__)[x]; } ///Get the bit representation of a float @@ -350,21 +386,21 @@ namespace hpx::parallel::detail::rfa { ///Get index of float-point precision ///The index of a non-binned type is the smallest index a binned type would - ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///need to have to sum it reproducibly. Higher indices correspond to smaller ///bins. static inline constexpr int binned_dindex(const ftype x) { int exp = EXP(x); if (exp == 0) { - if (x == 0.0) + if (x == static_cast(0.0)) { return MAXINDEX; } else { std::frexp(x, &exp); - return std::max((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); } } return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; @@ -372,7 +408,7 @@ namespace hpx::parallel::detail::rfa { ///Get index of manually specified binned double precision ///The index of a binned type is the bin that it corresponds to. Higher - ///indicies correspond to smaller bins. + ///indices correspond to smaller bins. 
inline int binned_index() const { return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - @@ -415,7 +451,9 @@ namespace hpx::parallel::detail::rfa { int shift = binned_index() - X_index; if (shift > 0) { -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -424,7 +462,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = carry((i - shift) * inccarY); } const ftype* const bins = binned_bins(X_index); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int j = 0; j < FOLD; j++) { if (j >= shift) @@ -456,16 +496,19 @@ namespace hpx::parallel::detail::rfa { if (binned_index0()) { M = primary(0); - ftype qd = x * COMPRESSION; + ftype qd = x * static_cast(COMPRESSION); auto& ql = get_bits(qd); ql |= 1; qd += M; primary(0) = qd; M -= qd; - M *= EXPANSION * 0.5; + auto temp_m = (double) (((double) EXPANSION) * 0.5); + M *= static_cast(temp_m); x += M; x += M; -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 1; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -484,7 +527,9 @@ namespace hpx::parallel::detail::rfa { { ftype qd = x; auto& ql = get_bits(qd); -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD - 1; i++) { M = primary(i * incpriY); @@ -549,7 +594,7 @@ namespace hpx::parallel::detail::rfa { int i = 0; if (ISNANINF(primary(0))) - return primary(0); + return (double) primary(0); if (ISZERO(primary(0))) return 0.0; @@ -563,29 +608,36 @@ namespace hpx::parallel::detail::rfa { { scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); - scaled = std::max( - std::min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + scaled = (std::max)( + (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); if (X_index == 0) { - Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carry(inccarX) * ((bins[1] / 6.0) * 
scale_down); - Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + Y += ((double) carry(0)) * + ((((double) bins[0]) / 6.0) * scale_down * EXPANSION); + Y += ((double) carry(inccarX)) * + ((((double) bins[1]) / 6.0) * scale_down); + Y += ((double) primary(0) - (double) bins[0]) * scale_down * + EXPANSION; i = 2; } else { - Y += carry(0) * ((bins[0] / 6.0) * scale_down); + Y += ((double) carry(0)) * + (((double) bins[0] / 6.0) * scale_down); i = 1; } for (; i < scaled; i++) { - Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); - Y += - (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + Y += ((double) carry(i * inccarX)) * + (((double) bins[i] / 6.0) * scale_down); + Y += ((double) primary((i - 1) * incpriX) - + (double) (bins[i - 1])) * + scale_down; } if (i == FOLD) { - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + Y += ((double) primary((FOLD - 1) * incpriX) - + (double) (bins[FOLD - 1])) * scale_down; return Y * scale_up; } @@ -596,20 +648,23 @@ namespace hpx::parallel::detail::rfa { Y *= scale_up; for (; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += primary((i - 1) * incpriX) - bins[i - 1]; + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + Y += ((double) primary((FOLD - 1) * incpriX) - + ((double) bins[FOLD - 1])); } else { - Y += carry(0) * (bins[0] / 6.0); + Y += ((double) carry(0)) * ((double) bins[0] / 6.0); for (i = 1; i < FOLD; i++) { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += (primary((i - 1) * incpriX) - bins[i - 1]); + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); } return Y; } @@ -626,7 +681,7 @@ namespace hpx::parallel::detail::rfa { if 
(ISNANINF(primary(0))) return primary(0); if (ISZERO(primary(0))) - return 0.0; + return 0.0f; //Note that the following order of summation is in order of decreasing //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and @@ -635,20 +690,22 @@ namespace hpx::parallel::detail::rfa { const auto* const bins = binned_bins(X_index); if (X_index == 0) { - Y += (double) carry(0) * (double) (bins[0] / 6.0) * + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) * (double) EXPANSION; - Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) carry(inccarX) * + (double) (((double) bins[1]) / 6.0); Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; i = 2; } else { - Y += (double) carry(0) * (double) (bins[0] / 6.0); + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0); i = 1; } for (; i < FOLD; i++) { - Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) carry(i * inccarX) * + (double) (((double) bins[i]) / 6.0); Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); } Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); @@ -693,8 +750,10 @@ namespace hpx::parallel::detail::rfa { if (shift > 0) { const auto* const bins = binned_bins(Y_index); - //shift Y upwards and add X to Y -#pragma unroll +//shift Y upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = FOLD - 1; i >= 1; i--) { if (i < shift) @@ -704,7 +763,9 @@ namespace hpx::parallel::detail::rfa { carry(i * inccarY) = x.carry(i * inccarX) + carry((i - shift) * inccarY); } -#pragma unroll +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { if (i == shift) @@ -716,8 +777,10 @@ namespace hpx::parallel::detail::rfa { else if (shift < 0) { const auto* const bins = binned_bins(X_index); - //shift X upwards and add X to Y -#pragma unroll +//shift X upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) 
{ if (i < -shift) @@ -730,8 +793,10 @@ namespace hpx::parallel::detail::rfa { else if (shift == 0) { const auto* const bins = binned_bins(X_index); - // add X to Y -#pragma unroll +// add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif for (int i = 0; i < FOLD; i++) { primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; @@ -770,7 +835,7 @@ namespace hpx::parallel::detail::rfa { } ///Return the endurance of the binned fp - constexpr int endurance() const + constexpr size_t endurance() const { return ENDURANCE; } @@ -866,11 +931,11 @@ namespace hpx::parallel::detail::rfa { { if (std::is_same_v) { - return binned_conv_single(1, 1); + return static_cast(binned_conv_single(1, 1)); } else { - return binned_conv_double(1, 1); + return static_cast(binned_conv_double(1, 1)); } } @@ -887,7 +952,8 @@ namespace hpx::parallel::detail::rfa { { const double X = std::abs(max_abs_val); const double S = std::abs(binned_sum); - return static_cast(max(X, std::ldexp(0.5, MIN_EXP - 1)) * + return static_cast( + (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) * std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + ((7.0 * EPSILON) / (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - @@ -972,7 +1038,7 @@ namespace hpx::parallel::detail::rfa { T max_abs_val = input[0]; for (size_t i = 0; i < N; i++) { - max_abs_val = max(max_abs_val, std::abs(input[i])); + max_abs_val = (std::max)(max_abs_val, std::abs(input[i])); } add(input, N, max_abs_val); } @@ -1141,4 +1207,4 @@ namespace hpx::parallel::detail::rfa { } }; -} // namespace hpx::parallel::detail::rfa \ No newline at end of file +} // namespace hpx::parallel::detail::rfa diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index b8435eec9a02..996865d519c5 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ 
b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -10,7 +10,6 @@ #pragma once -#include "detail/reduce_deterministic.hpp" #if defined(DOXYGEN) namespace hpx { @@ -420,7 +419,7 @@ namespace hpx::parallel { ReproducibleFloatingAccumulator { T val = *part_begin; return hpx::parallel::detail:: - sequential_reduce_deterministic_rfa( + sequential_reduce_deterministic( HPX_FORWARD(ExPolicy, policy), ++part_begin, --part_size, HPX_MOVE(val), r); }; @@ -433,7 +432,7 @@ namespace hpx::parallel { r = HPX_FORWARD(Reduce, r), policy](auto&& results) -> T { return hpx::parallel::detail:: - sequential_reduce_deterministic_rfa( + sequential_reduce_deterministic( HPX_FORWARD(ExPolicy, policy), hpx::util::begin(results), hpx::util::size(results), init, r) diff --git a/libs/core/algorithms/tests/performance/CMakeLists.txt b/libs/core/algorithms/tests/performance/CMakeLists.txt index d74788a9b47f..96ce826dc742 100644 --- a/libs/core/algorithms/tests/performance/CMakeLists.txt +++ b/libs/core/algorithms/tests/performance/CMakeLists.txt @@ -16,6 +16,7 @@ set(benchmarks benchmark_partial_sort_parallel benchmark_partition benchmark_partition_copy + benchmark_reduce_deterministic benchmark_remove benchmark_remove_if benchmark_scan_algorithms diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp new file mode 100644 index 000000000000..daaee2b1269b --- /dev/null +++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp @@ -0,0 +1,148 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include + +#if !defined(HPX_COMPUTE_DEVICE_CODE) +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int seed = 1000; +std::mt19937 gen(seed); + +template +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) +{ + return LO + + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +void bench_reduce_deterministic( + const auto& deterministic_shuffled, const auto& val_det, const auto& op) +{ + // check if different type for deterministic and nondeterministic + // and same result + + auto r1_shuffled = + hpx::reduce_deterministic((std::begin(deterministic_shuffled)), + (std::end(deterministic_shuffled)), val_det, op); + + HPX_UNUSED(r1_shuffled); +} + +void bench_reduce(const auto& policy, const auto& deterministic_shuffled, + const auto& val_det, const auto& op) +{ + auto r = hpx::reduce(policy, (std::begin(deterministic_shuffled)), + (std::end(deterministic_shuffled)), val_det, op); + + HPX_UNUSED(r); +} + +////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + std::srand(seed); + + auto test_count = vm["test_count"].as(); + + hpx::util::perftests_init(vm); + + // verify that input is within domain of program + if (test_count == 0 || test_count < 0) + { + std::cerr << "test_count cannot be zero or negative...\n" << std::flush; + hpx::local::finalize(); + return -1; + } + + { + using FloatTypeDeterministic = float; + std::size_t LEN = 10000; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 
1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = get_rand( + -num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = + deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + { + hpx::util::perftests_report("reduce", "seq", test_count, [&]() { + bench_reduce( + hpx::execution::seq, deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report("reduce", "par", test_count, [&]() { + bench_reduce( + hpx::execution::par, deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "reduce deterministic", "seq", test_count, [&]() { + bench_reduce_deterministic( + deterministic_shuffled, val_det, op); + }); + } + + hpx::util::perftests_print_times(); + } + + return hpx::local::finalize(); +} + +/////////////////////////////////////////////////////////////////////////////// +int main(int argc, char* argv[]) +{ + using namespace hpx::program_options; + + options_description cmdline("usage: " HPX_APPLICATION_STRING " [options]"); + + // clang-format off + cmdline.add_options() + ("test_count", value()->default_value(100), + "number of tests to be averaged") + ; + // clang-format on + + hpx::util::perftests_cfg(cmdline); + hpx::local::init_params init_args; + init_args.desc_cmdline = cmdline; + init_args.cfg = {"hpx.os_threads=all"}; + + return hpx::local::init(hpx_main, argc, argv, init_args); +} +#endif diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 76dc5fcd9806..559ee830030e 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -246,7 +246,3 @@ foreach(test ${tests}) 
"modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS} ) endforeach() - -target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address) - -target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address) \ No newline at end of file diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 694b50cfb76d..5a06c509efdc 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -4,8 +4,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -#pragma once - #include #include #include @@ -19,6 +17,7 @@ #include #include #include +#include #include #include "test_utils.hpp" @@ -27,11 +26,12 @@ int seed = std::random_device{}(); std::mt19937 gen(seed); template -T get_rand( - T LO = std::numeric_limits::min(), T HI = std::numeric_limits::max()) +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) { return LO + - static_cast(std::rand()) / (static_cast(RAND_MAX / (HI - LO))); + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); } /////////////////////////////////////////////////////////////////////////////// @@ -42,10 +42,12 @@ void test_reduce1(IteratorTag) { // check if different type for deterministic and nondeeterministic // and same result i.e. 
correct computation - using base_iterator_det = std::vector::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; std::vector deterministic(LEN); @@ -75,51 +77,8 @@ void test_reduce1(IteratorTag) FloatTypeNonDeterministic r3 = std::accumulate( nondeterministic.begin(), nondeterministic.end(), val_non_det); - HPX_TEST_EQ(r1, r3); - HPX_TEST_EQ(r2, r3); -} - -template -void test_reduce_parallel1(IteratorTag) -{ - // check if different type for deterministic and nondeeterministic - // and same result i.e. correct computation - using base_iterator_det = std::vector::iterator; - using iterator_det = test::test_iterator; - - using base_iterator_ndet = std::vector::iterator; - using iterator_ndet = test::test_iterator; - - std::vector deterministic(LEN); - std::vector nondeterministic(LEN); - - std::iota( - deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); - - std::iota(nondeterministic.begin(), nondeterministic.end(), - FloatTypeNonDeterministic(0)); - - FloatTypeDeterministic val_det(0); - FloatTypeNonDeterministic val_non_det(0); - auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { - return v1 + v2; - }; - - FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par, - iterator_det(std::begin(deterministic)), - iterator_det(std::end(deterministic)), val_det, op); - - // verify values - // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par, - // iterator_ndet(std::begin(nondeterministic)), - // iterator_ndet(std::end(nondeterministic)), val_non_det, op); - - FloatTypeNonDeterministic r3 = std::accumulate( - nondeterministic.begin(), nondeterministic.end(), val_non_det); - - HPX_TEST_EQ(r1, r3); - // HPX_TEST_EQ(r2, r3); + HPX_TEST_EQ(static_cast(r1), r3); + 
HPX_TEST_EQ(static_cast(r2), r3); } template ::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; - constexpr auto num_bounds_det = - std::is_same_v ? 1000.0f : 1000000.0f; + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; std::vector deterministic(LEN); @@ -165,11 +125,15 @@ void test_reduce_determinism(IteratorTag) r1_shuffled); // Deterministically calculated, should always satisfy } +/// This test function is never called because it is not guaranteed to pass +/// It serves an important purpose to demonstrate that floating point summation +/// is not always associative i.e. a+b+c != a+c+b template void test_orig_reduce_determinism(IteratorTag) { - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; constexpr auto num_bounds_ndet = @@ -221,7 +185,6 @@ void test_reduce1() test_reduce1(IteratorTag()); test_reduce1(IteratorTag()); test_reduce1(IteratorTag()); - test_reduce_parallel1(IteratorTag()); } template @@ -233,21 +196,22 @@ void test_reduce2() test_reduce_determinism(IteratorTag()); } -template -void test_reduce3() -{ - using namespace hpx::execution; +// template +// void test_reduce3() +// { +// using namespace hpx::execution; - test_orig_reduce_determinism(IteratorTag()); - test_orig_reduce_determinism(IteratorTag()); -} +// test_orig_reduce_determinism(IteratorTag()); +// test_orig_reduce_determinism(IteratorTag()); +// } void reduce_test1() { test_reduce1(); test_reduce2(); - test_reduce3(); - // test_reduce1(); + // test_reduce3(); + test_reduce1(); + test_reduce2(); } /////////////////////////////////////////////////////////////////////////////// diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index d86fc5775415..b29652a3ede1 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp 
+++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1UL); + HPX_TEST_EQ(i.get_tag(), 1); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0UL); + HPX_TEST_EQ(j.get_next_tag(), 0); } { diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 8a01d9574853..3d7cf5da2aa0 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,10 +57,6 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); -#ifdef __APPLE__ - template HPX_CORE_EXPORT void print_dec( - std::ostream&, unsigned long const&, int); -#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int);