Move shared CCL variables into single struct #665

Open · wants to merge 1 commit into main
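The patch collapses the three block-shared variables used by the CCL kernel (`partition_start`, `partition_end`, `outi`) into a single `ccl_kernel_static_smem_parcel` struct, so every backend declares one shared-memory object and passes one reference through the call chain instead of three. A minimal, self-contained CUDA sketch of the pattern (the kernel, sizes and `main` below are illustrative only, not traccc code):

```cuda
#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirrors the struct introduced by the PR: one shared-memory "parcel"
// instead of three separately declared __shared__ variables.
struct ccl_kernel_static_smem_parcel {
    std::size_t partition_start;
    std::size_t partition_end;
    std::uint32_t outi;
};

// Illustrative kernel (not the traccc one): thread 0 fills the parcel and,
// after a block-wide barrier, every thread reads consistent boundaries.
__global__ void partition_sizes(std::size_t num_cells, std::size_t target_size,
                                std::size_t* out_sizes) {

    __shared__ ccl_kernel_static_smem_parcel shared;

    if (threadIdx.x == 0) {
        shared.partition_start =
            static_cast<std::size_t>(blockIdx.x) * target_size;
        const std::size_t naive_end = shared.partition_start + target_size;
        shared.partition_end = (naive_end < num_cells) ? naive_end : num_cells;
        shared.outi = 0;
    }
    __syncthreads();

    if (threadIdx.x == 0) {
        out_sizes[blockIdx.x] = shared.partition_end - shared.partition_start;
    }
}

int main() {
    const std::size_t num_cells = 1000;
    const std::size_t target_size = 256;
    const unsigned int n_blocks = 4;

    std::size_t* out_sizes = nullptr;
    cudaMallocManaged(&out_sizes, n_blocks * sizeof(std::size_t));

    partition_sizes<<<n_blocks, 128>>>(num_cells, target_size, out_sizes);
    cudaDeviceSynchronize();

    for (unsigned int i = 0; i < n_blocks; ++i) {
        std::printf("block %u handles %zu cells\n", i, out_sizes[i]);
    }

    cudaFree(out_sizes);
    return 0;
}
```

Grouping the variables also makes it easier to keep the per-backend declarations (alpaka's `declareSharedVar`, CUDA's `__shared__`, SYCL's local accessor) in sync with the `ccl_kernel` signature.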
12 changes: 4 additions & 8 deletions device/alpaka/src/clusterization/clusterization_algorithm.cpp
@@ -38,11 +38,8 @@ struct CCLKernel {

  traccc::alpaka::thread_id1 thread_id(acc);

- auto& partition_start =
- ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
- auto& partition_end =
- ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
- auto& outi = ::alpaka::declareSharedVar<std::size_t, __COUNTER__>(acc);
+ auto& shared = ::alpaka::declareSharedVar<
+ device::details::ccl_kernel_static_smem_parcel, __COUNTER__>(acc);

  device::details::index_t* const shared_v =
  ::alpaka::getDynSharedMem<device::details::index_t>(acc);
@@ -56,9 +53,8 @@

  alpaka::barrier<TAcc> barry_r(&acc);

- device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
- partition_start, partition_end, outi, f_view,
- gf_view, f_backup_view, gf_backup_view,
+ device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared,
+ f_view, gf_view, f_backup_view, gf_backup_view,
  adjc_backup_view, adjv_backup_view, backup_mutex,
  barry_r, measurements_view, cell_links);
  }
Changes in a second file (file path not shown):
@@ -26,6 +26,13 @@
  #include <cstddef>

  namespace traccc::device {
+ namespace details {
+ struct ccl_kernel_static_smem_parcel {
+ std::size_t partition_start;
+ std::size_t partition_end;
+ uint32_t outi;
+ };
+ } // namespace details

  /// Function which reads raw detector cells and turns them into measurements.
  ///
@@ -59,7 +66,7 @@ TRACCC_DEVICE inline void ccl_kernel(
  const clustering_config cfg, const thread_id_t& thread_id,
  const cell_collection_types::const_view cells_view,
  const cell_module_collection_types::const_view modules_view,
- std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
+ details::ccl_kernel_static_smem_parcel& smem,
  vecmem::data::vector_view<details::index_t> f_view,
  vecmem::data::vector_view<details::index_t> gf_view,
  vecmem::data::vector_view<details::index_t> f_backup_view,
Changes in a third file (file path not shown):
@@ -136,15 +136,15 @@ TRACCC_DEVICE void fast_sv_1(const thread_id_t& thread_id,
  template <device::concepts::barrier barrier_t,
  device::concepts::thread_id1 thread_id_t>
  TRACCC_DEVICE inline void ccl_core(
- const thread_id_t& thread_id, std::size_t& partition_start,
- std::size_t& partition_end, vecmem::device_vector<details::index_t> f,
+ const thread_id_t& thread_id, details::ccl_kernel_static_smem_parcel& smem,
+ vecmem::device_vector<details::index_t> f,
  vecmem::device_vector<details::index_t> gf,
  vecmem::data::vector_view<unsigned int> cell_links, details::index_t* adjv,
  unsigned char* adjc, const cell_collection_types::const_device cells_device,
  const cell_module_collection_types::const_device modules_device,
  measurement_collection_types::device measurements_device,
  barrier_t& barrier) {
- const details::index_t size = partition_end - partition_start;
+ const details::index_t size = smem.partition_end - smem.partition_start;

  assert(size <= f.size());
  assert(size <= gf.size());
@@ -160,8 +160,8 @@ TRACCC_DEVICE inline void ccl_core(
  const details::index_t cid =
  tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX();
  adjc[tst] = 0;
- reduce_problem_cell(cells_device, cid, partition_start, partition_end,
- adjc[tst], &adjv[8 * tst]);
+ reduce_problem_cell(cells_device, cid, smem.partition_start,
+ smem.partition_end, adjc[tst], &adjv[8 * tst]);
  }

  for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
@@ -198,9 +198,10 @@ TRACCC_DEVICE inline void ccl_core(
  const measurement_collection_types::device::size_type meas_pos =
  measurements_device.push_back({});
  // Set up the measurement under the appropriate index.
- aggregate_cluster(
- cells_device, modules_device, f, partition_start, partition_end,
- cid, measurements_device.at(meas_pos), cell_links, meas_pos);
+ aggregate_cluster(cells_device, modules_device, f,
+ smem.partition_start, smem.partition_end, cid,
+ measurements_device.at(meas_pos), cell_links,
+ meas_pos);
  }
  }
  }
@@ -211,7 +212,7 @@ TRACCC_DEVICE inline void ccl_kernel(
  const clustering_config cfg, const thread_id_t& thread_id,
  const cell_collection_types::const_view cells_view,
  const cell_module_collection_types::const_view modules_view,
- std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
+ details::ccl_kernel_static_smem_parcel& smem,
  vecmem::data::vector_view<details::index_t> f_view,
  vecmem::data::vector_view<details::index_t> gf_view,
  vecmem::data::vector_view<details::index_t> f_backup_view,
@@ -252,7 +253,7 @@
  assert(start < num_cells);
  std::size_t end =
  std::min(num_cells, start + cfg.target_partition_size());
- outi = 0;
+ smem.outi = 0;

  /*
  * Next, shift the starting point to a position further in the
@@ -280,9 +281,9 @@
  cells_device[end - 1].channel1 + 1) {
  ++end;
  }
- partition_start = start;
- partition_end = end;
- assert(partition_start <= partition_end);
+ smem.partition_start = start;
+ smem.partition_end = end;
+ assert(smem.partition_start <= smem.partition_end);
  }

  barrier.blockBarrier();
@@ -303,7 +304,7 @@
  // into a return. As such, we cannot use returns in this kernel.

  // Get partition for this thread group
- const details::index_t size = partition_end - partition_start;
+ const details::index_t size = smem.partition_end - smem.partition_start;

  // If the size is zero, we can just retire the whole block.
  if (size == 0) {
@@ -342,8 +343,7 @@
  use_scratch = false;
  }

- ccl_core(thread_id, partition_start, partition_end,
- use_scratch ? f_backup : f_primary,
+ ccl_core(thread_id, smem, use_scratch ? f_backup : f_primary,
  use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc,
  cells_device, modules_device, measurements_device, barrier);

8 changes: 3 additions & 5 deletions device/cuda/src/clusterization/clusterization_algorithm.cu
@@ -42,8 +42,7 @@ __global__ void ccl_kernel(
  vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
  unsigned int* backup_mutex_ptr) {

- __shared__ std::size_t partition_start, partition_end;
- __shared__ std::size_t outi;
+ __shared__ device::details::ccl_kernel_static_smem_parcel shared;
  extern __shared__ device::details::index_t shared_v[];
  vecmem::device_atomic_ref<unsigned int> backup_mutex(*backup_mutex_ptr);

@@ -58,9 +57,8 @@
  traccc::cuda::barrier barry_r;
  const cuda::thread_id1 thread_id;

- device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
- partition_start, partition_end, outi, f_view, gf_view,
- f_backup_view, gf_backup_view, adjc_backup_view,
+ device::ccl_kernel(cfg, thread_id, cells_view, modules_view, shared, f_view,
+ gf_view, f_backup_view, gf_backup_view, adjc_backup_view,
  adjv_backup_view, backup_mutex, barry_r,
  measurements_view, cell_links);
  }
20 changes: 9 additions & 11 deletions device/sycl/src/clusterization/clusterization_algorithm.sycl
@@ -117,14 +117,16 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
  details::get_queue(m_queue)
  .submit([&](::sycl::handler& h) {
  // Allocate shared memory for the kernel.
- vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
+ vecmem::sycl::local_accessor<
+ device::details::ccl_kernel_static_smem_parcel>
+ shared(1, h);
  vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
  2 * m_config.max_partition_size(), h);

  // Launch the kernel.
  h.parallel_for<kernels::ccl_kernel>(
  cclKernelRange,
- [shared_uint, shared_idx, cells_view, modules_view,
+ [shared, shared_idx, cells_view, modules_view,
  measurements_view, cell_links_view,
  f_backup_view = vecmem::get_data(m_f_backup),
  gf_backup_view = vecmem::get_data(m_gf_backup),
@@ -139,9 +141,6 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
  vecmem::data::vector_view<device::details::index_t> gf_view{
  static_cast<vector_size_t>(cfg.max_partition_size()),
  &shared_idx[cfg.max_partition_size()]};
- std::size_t& partition_start = shared_uint[0];
- std::size_t& partition_end = shared_uint[1];
- std::size_t& outi = shared_uint[2];

  // Mutex for scratch space
  vecmem::device_atomic_ref<unsigned int> backup_mutex(
@@ -152,12 +151,11 @@
  const sycl::thread_id1 thread_id(item);

  // Run the algorithm for this thread.
- device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
- partition_start, partition_end, outi,
- f_view, gf_view, f_backup_view,
- gf_backup_view, adjc_backup_view,
- adjv_backup_view, backup_mutex, barry_r,
- measurements_view, cell_links_view);
+ device::ccl_kernel(
+ cfg, thread_id, cells_view, modules_view, shared[0],
+ f_view, gf_view, f_backup_view, gf_backup_view,
+ adjc_backup_view, adjv_backup_view, backup_mutex,
+ barry_r, measurements_view, cell_links_view);
  });
  })
  .wait_and_throw();
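For the SYCL backend, the change swaps a three-element `local_accessor<std::size_t>` for a one-element accessor of the new struct and hands `shared[0]` to `device::ccl_kernel`. A minimal sketch of that allocation pattern in plain SYCL 2020 (the `vecmem::sycl::local_accessor` used in the PR mirrors this interface; the kernel body and sizes below are illustrative assumptions, not traccc code):

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <cstdint>

// Same shared-memory parcel as introduced by the PR.
struct ccl_kernel_static_smem_parcel {
    std::size_t partition_start;
    std::size_t partition_end;
    std::uint32_t outi;
};

int main() {
    sycl::queue queue;

    queue
        .submit([&](sycl::handler& cgh) {
            // One struct instance in work-group local memory, replacing the
            // previous three-element local accessor of std::size_t.
            sycl::local_accessor<ccl_kernel_static_smem_parcel, 1> shared(
                sycl::range<1>(1), cgh);

            cgh.parallel_for(
                sycl::nd_range<1>(sycl::range<1>(128), sycl::range<1>(128)),
                [=](sycl::nd_item<1> item) {
                    // One work-item initialises the parcel ...
                    if (item.get_local_id(0) == 0) {
                        shared[0].partition_start = 0;
                        shared[0].partition_end = 128;
                        shared[0].outi = 0;
                    }
                    // ... and after a group barrier every work-item sees a
                    // consistent view of the partition boundaries.
                    sycl::group_barrier(item.get_group());
                    (void)(shared[0].partition_end -
                           shared[0].partition_start);
                });
        })
        .wait_and_throw();

    return 0;
}
```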