#pragma once
#include <hpx/futures/future.hpp>
#include <hpx/kokkos/executors.hpp>
#ifdef OCTOTIGER_HAVE_KOKKOS
//#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#include <Kokkos_Core.hpp>
#include <hpx/kokkos.hpp>

#include <stream_manager.hpp>
#include <aggregation_manager.hpp>
#include "octotiger/options.hpp"    // provides opts().max_kernels_fused (used below)

// ============================================================
// Aggregation Helpers // TODO(daissgr) Move to cppuddle?
// ============================================================
//
#ifdef __NVCC__
#include <cuda/std/tuple>
#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
// Structured bindings for cuda::std::tuple are broken in CUDA versions before 12.2.
// See https://github.com/NVIDIA/libcudacxx/issues/316
// According to https://github.com/NVIDIA/libcudacxx/pull/317, the fix is to add
// tuple_element and tuple_size specializations in the std namespace, which the
// following snippet does. This is only necessary for old CUDA versions; newer
// ones already contain the fix.
namespace std {
    template <size_t _Ip, class... _Tp>
    struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
      : _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
    template <class... _Tp>
    struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
      : _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
}    // namespace std
#endif
#endif
// Executor pool that aggregates (fuses) multiple hydro kernel launches into one
static const char hydro_kokkos_kernel_identifier[] = "hydro_kernel_aggregator_kokkos";
template <typename executor_t>
using hydro_kokkos_agg_executor_pool = aggregation_pool<hydro_kokkos_kernel_identifier,
    executor_t, round_robin_pool<executor_t>>;
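
// Minimal usage sketch (hypothetical, assuming the aggregation_pool interface
// from cppuddle): request an executor slice from the pool and run work through
// it, letting the runtime fuse launches from concurrent requesters:
//
//   auto slice_fut = hydro_kokkos_agg_executor_pool<executor_t>::request_executor_slice();
//   auto done = slice_fut.value().then([](auto&& fut) {
//       auto agg_exec = fut.get();    // Aggregated_Executor<executor_t>::Executor_Slice
//       /* ... allocate aggregated buffers, launch the kernel via agg_exec ... */
//   });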

// Return the subview of an aggregated (pooled) view that belongs to the given
// slice. Assumes the view is divided evenly among max_slices slices.
template <typename Agg_view_t>
CUDA_GLOBAL_METHOD typename Agg_view_t::view_type get_slice_subview(
    const size_t slice_id, const size_t max_slices, const Agg_view_t& agg_view) {
    const size_t slice_size = agg_view.size() / max_slices;
    return Kokkos::subview(agg_view,
        std::make_pair<size_t, size_t>(slice_id * slice_size, (slice_id + 1) * slice_size));
}

// Map an arbitrary number of aggregated views to their subviews for the given
// slice. Returns a tuple of subviews; in device code this is a cuda::std::tuple,
// since std::tuple is not usable there.
template <typename Integer, std::enable_if_t<std::is_integral<Integer>::value, bool> = true,
    typename Agg_view_t, typename... Args>
CUDA_GLOBAL_METHOD auto map_views_to_slice(const Integer slice_id, const Integer max_slices,
    const Agg_view_t& current_arg, const Args&... rest) {
    static_assert(
        Kokkos::is_view<typename Agg_view_t::view_type>::value, "Argument not an aggregated view");
#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
    if constexpr (sizeof...(Args) > 0) {
        return cuda::std::tuple_cat(
            cuda::std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
            map_views_to_slice(slice_id, max_slices, rest...));
    } else {
        return cuda::std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg));
    }
#else
    if constexpr (sizeof...(Args) > 0) {
        return std::tuple_cat(std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
            map_views_to_slice(slice_id, max_slices, rest...));
    } else {
        return std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg));
    }
#endif
}
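
// Usage sketch inside an aggregated kernel (hypothetical views rho_agg and
// flux_agg): map several aggregated views to the current slice in one call and
// unpack the resulting tuple via structured bindings:
//
//   auto [rho, flux] = map_views_to_slice(slice_id, max_slices, rho_agg, flux_agg);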

// Convenience overload: derive the slice id and the maximum number of slices
// from an aggregated executor slice instead of passing them explicitly.
// Note: despite the CUDA_GLOBAL_METHOD annotation, this overload relies on
// opts() and std::tuple and is therefore only usable from host code.
template <typename Agg_executor_t, typename Agg_view_t,
    std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value, bool> = true,
    typename... Args>
CUDA_GLOBAL_METHOD auto map_views_to_slice(
    const Agg_executor_t& agg_exec, const Agg_view_t& current_arg, const Args&... rest) {
    const size_t slice_id = agg_exec.id;
    const size_t max_slices = opts().max_kernels_fused;
    static_assert(
        Kokkos::is_view<typename Agg_view_t::view_type>::value, "Argument not an aggregated view");
    if constexpr (sizeof...(Args) > 0) {
        return std::tuple_cat(std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
            map_views_to_slice(agg_exec, rest...));
    } else {
        return std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg));
    }
}
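
// Equivalent sketch using an executor slice instead of explicit indices
// (hypothetical views as above):
//
//   auto [rho, flux] = map_views_to_slice(agg_exec, rho_agg, flux_agg);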

// Aggregated deep copy: the copy is issued only once per aggregated executor
// (guarded by sync_aggregation_slices()); the remaining slices skip it.
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t& agg_exec, TargetView_t& target, SourceView_t& source) {
    if (agg_exec.sync_aggregation_slices()) {
        Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target, source);
    }
}

// Variant that copies only the used portion of the aggregated buffers, i.e.
// the first number_slices * elements_per_slice elements.
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(
    Agg_executor_t& agg_exec, TargetView_t& target, SourceView_t& source, int elements_per_slice) {
    if (agg_exec.sync_aggregation_slices()) {
        const size_t number_slices = agg_exec.number_slices;
        auto target_slices = Kokkos::subview(
            target, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        auto source_slices = Kokkos::subview(
            source, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        Kokkos::deep_copy(
            agg_exec.get_underlying_executor().instance(), target_slices, source_slices);
    }
}
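
// Usage sketch (hypothetical host/device buffers): copy back only the slices
// actually in use; the call is a no-op for all but one slice of the
// aggregated executor:
//
//   aggregated_deep_copy(agg_exec, host_buffer, device_buffer, elements_per_slice);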

// Asynchronous aggregated deep copy: wrapped through the executor slice so the
// copy is issued once per aggregated executor; returns a future signaling
// completion of the copy.
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregated_deep_copy_async(
    typename Aggregated_Executor<executor_t>::Executor_Slice& agg_exec, TargetView_t& target,
    SourceView_t& source) {
    const size_t gpu_id = agg_exec.parent.gpu_id;
    auto launch_copy_lambda = [gpu_id](TargetView_t& target, SourceView_t& source,
                                  executor_t& exec) -> hpx::shared_future<void> {
        stream_pool::select_device<executor_t, round_robin_pool<executor_t>>(gpu_id);
        return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
    };
    return agg_exec.wrap_async(
        launch_copy_lambda, target, source, agg_exec.get_underlying_executor());
}

// Asynchronous variant that copies only the used portion of the aggregated
// buffers (number_slices * elements_per_slice elements).
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregated_deep_copy_async(
    typename Aggregated_Executor<executor_t>::Executor_Slice& agg_exec, TargetView_t& target,
    SourceView_t& source, int elements_per_slice) {
    const size_t number_slices = agg_exec.number_slices;
    const size_t gpu_id = agg_exec.parent.gpu_id;
    auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](TargetView_t& target,
                                  SourceView_t& source,
                                  executor_t& exec) -> hpx::shared_future<void> {
        stream_pool::select_device<executor_t, round_robin_pool<executor_t>>(gpu_id);
        auto target_slices = Kokkos::subview(
            target, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        auto source_slices = Kokkos::subview(
            source, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        return hpx::kokkos::deep_copy_async(exec.instance(), target_slices, source_slices);
    };
    return agg_exec.wrap_async(
        launch_copy_lambda, target, source, agg_exec.get_underlying_executor());
}
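
// Usage sketch (hypothetical buffers): executor_t cannot be deduced from the
// slice type, so it is passed explicitly; the returned future signals
// completion of the copy:
//
//   auto copy_done = aggregated_deep_copy_async<executor_t>(
//       agg_exec, host_buffer, device_buffer, elements_per_slice);
//   copy_done.get();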
#endif