Commit b5c8df3

Add work aggregation to p2p kokkos kernel
1 parent dd5cb88 commit b5c8df3

File tree

6 files changed: +492, -134 lines


octotiger/aggregation_util.hpp

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
#pragma once
#include <hpx/futures/future.hpp>
#include <hpx/kokkos/executors.hpp>
#ifdef OCTOTIGER_HAVE_KOKKOS
//#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#include <Kokkos_Core.hpp>
#include <hpx/kokkos.hpp>

#include <stream_manager.hpp>
#include <aggregation_manager.hpp>

// ============================================================
// Aggregation Helpers // TODO(daissgr) Move to cppuddle?
// ============================================================
//
#ifdef __NVCC__
#include <cuda/std/tuple>
#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
// cuda::std::tuple structured bindings are broken in CUDA < 1202.
// See https://github.com/NVIDIA/libcudacxx/issues/316
// According to https://github.com/NVIDIA/libcudacxx/pull/317 the fix for this
// is to move tuple_element and tuple_size into the std namespace,
// which the following snippet does. This is only necessary for old CUDA
// versions; newer ones contain a fix for this issue.
namespace std {
    template <size_t _Ip, class... _Tp>
    struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
      : _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
    template <class... _Tp>
    struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
      : _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
}
#endif
#endif

static const char hydro_kokkos_kernel_identifier[] = "hydro_kernel_aggregator_kokkos";
template <typename executor_t>
using hydro_kokkos_agg_executor_pool = aggregation_pool<hydro_kokkos_kernel_identifier,
    executor_t, round_robin_pool<executor_t>>;

// Returns the subview of an aggregated view that belongs to the given slice.
template <typename Agg_view_t>
CUDA_GLOBAL_METHOD typename Agg_view_t::view_type get_slice_subview(
    const size_t slice_id, const size_t max_slices, const Agg_view_t& agg_view) {
    const size_t slice_size = agg_view.size() / max_slices;
    return Kokkos::subview(agg_view,
        std::make_pair<size_t, size_t>(slice_id * slice_size, (slice_id + 1) * slice_size));
}

// Maps an arbitrary number of aggregated views to one slice's subviews and
// returns them as a tuple (cuda::std::tuple in device code, std::tuple on the host).
template <typename Integer, std::enable_if_t<std::is_integral<Integer>::value, bool> = true,
    typename Agg_view_t, typename... Args>
CUDA_GLOBAL_METHOD auto map_views_to_slice(const Integer slice_id, const Integer max_slices,
    const Agg_view_t& current_arg, const Args&... rest) {
    static_assert(
        Kokkos::is_view<typename Agg_view_t::view_type>::value, "Argument not an aggregated view");
#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
    if constexpr (sizeof...(Args) > 0) {
        return cuda::std::tuple_cat(
            cuda::std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
            map_views_to_slice(slice_id, max_slices, rest...));
    } else {
        return cuda::std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg));
    }
#else
    if constexpr (sizeof...(Args) > 0) {
        return std::tuple_cat(std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
            map_views_to_slice(slice_id, max_slices, rest...));
    } else {
        return std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg));
    }
#endif
}

// Convenience overload: takes the slice id from the executor slice and the
// slice count from opts().max_kernels_fused.
template <typename Agg_executor_t, typename Agg_view_t,
    std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value, bool> = true,
    typename... Args>
CUDA_GLOBAL_METHOD auto map_views_to_slice(const Agg_executor_t& agg_exec,
    const Agg_view_t& current_arg, const Args&... rest) {
    const size_t slice_id = agg_exec.id;
    const size_t max_slices = opts().max_kernels_fused;
    static_assert(
        Kokkos::is_view<typename Agg_view_t::view_type>::value, "Argument not an aggregated view");
    if constexpr (sizeof...(Args) > 0) {
        return std::tuple_cat(std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
            map_views_to_slice(agg_exec, rest...));
    } else {
        return std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg));
    }
}

// Deep copy of entire aggregated views; sync_aggregation_slices() ensures that
// only one of the aggregated slices actually issues the copy.
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t& agg_exec, TargetView_t& target, SourceView_t& source) {
    if (agg_exec.sync_aggregation_slices()) {
        Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target, source);
    }
}

// Variant that copies only the first number_slices * elements_per_slice
// elements, i.e. the slices that are actually in use.
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(
    Agg_executor_t& agg_exec, TargetView_t& target, SourceView_t& source, int elements_per_slice) {
    if (agg_exec.sync_aggregation_slices()) {
        const size_t number_slices = agg_exec.number_slices;
        auto target_slices = Kokkos::subview(
            target, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        auto source_slices = Kokkos::subview(
            source, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        Kokkos::deep_copy(
            agg_exec.get_underlying_executor().instance(), target_slices, source_slices);
    }
}

// Asynchronous counterpart: wraps the copy via wrap_async and returns the
// slice's future.
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
    typename Aggregated_Executor<executor_t>::Executor_Slice& agg_exec, TargetView_t& target,
    SourceView_t& source) {
    const size_t gpu_id = agg_exec.parent.gpu_id;
    auto launch_copy_lambda = [gpu_id](TargetView_t& target, SourceView_t& source,
                                  executor_t& exec) -> hpx::shared_future<void> {
        stream_pool::select_device<executor_t, round_robin_pool<executor_t>>(gpu_id);
        return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
    };
    return agg_exec.wrap_async(
        launch_copy_lambda, target, source, agg_exec.get_underlying_executor());
}

// Asynchronous variant restricted to the slices that are actually in use.
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
    typename Aggregated_Executor<executor_t>::Executor_Slice& agg_exec, TargetView_t& target,
    SourceView_t& source, int elements_per_slice) {
    const size_t number_slices = agg_exec.number_slices;
    const size_t gpu_id = agg_exec.parent.gpu_id;
    auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](TargetView_t& target,
                                  SourceView_t& source,
                                  executor_t& exec) -> hpx::shared_future<void> {
        stream_pool::select_device<executor_t, round_robin_pool<executor_t>>(gpu_id);
        auto target_slices = Kokkos::subview(
            target, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        auto source_slices = Kokkos::subview(
            source, std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
        return hpx::kokkos::deep_copy_async(exec.instance(), target_slices, source_slices);
    };
    return agg_exec.wrap_async(
        launch_copy_lambda, target, source, agg_exec.get_underlying_executor());
}
#endif
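
For orientation, here is a minimal usage sketch of how these helpers compose inside a slice launch. It is not code from this commit: the function and buffer names are hypothetical, and it assumes agg_exec is an Executor_Slice previously obtained from the hydro_kokkos_agg_executor_pool declared above (e.g. via cppuddle's aggregation manager).

// Usage sketch only (not from this commit); names below are placeholders.
template <typename executor_t, typename agg_buffer_t>
void p2p_slice_sketch(typename Aggregated_Executor<executor_t>::Executor_Slice& agg_exec,
    agg_buffer_t& combined_device_buf, agg_buffer_t& combined_host_buf,
    int elements_per_slice) {
    // Each fused kernel touches only its own subview of the shared, aggregated
    // buffer. In device code such structured bindings are what the
    // cuda::std::tuple workaround above makes possible on old CUDA versions.
    auto [device_slice] = map_views_to_slice(agg_exec, combined_device_buf);

    // ... launch the p2p kernel on agg_exec.get_underlying_executor().instance(),
    // reading from and writing to device_slice ...

    // Copy back only the first number_slices * elements_per_slice elements;
    // sync_aggregation_slices() inside aggregated_deep_copy makes exactly one
    // slice issue the combined copy on behalf of all fused kernels.
    aggregated_deep_copy(agg_exec, combined_host_buf, combined_device_buf,
        elements_per_slice);
}

The point of the aggregation: up to opts().max_kernels_fused small per-subgrid launches share one underlying executor and one set of combined buffers, so device transfers and kernel launches can be batched instead of being issued once per subgrid.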

octotiger/common_kernel/kokkos_util.hpp

Lines changed: 1 addition & 0 deletions
@@ -201,4 +201,5 @@ using normal_host_buffer = kokkos_host_array<T>;
 template <typename T>
 using normal_device_buffer = kokkos_device_array<T>;
 
+
 #endif

0 commit comments