Skip to content

Commit

Permalink
Improve robustness and performance of CCL
Browse files Browse the repository at this point in the history
This commit partially addresses acts-project#567. In the past, the CCL kernel was
unable to deal with extremely large partitions. Although this is very
unlikely to happen, our ODD samples contain a few cases of partitions so
large that they crash the code. This commit equips the CCL code with some
scratch memory which it can reserve using a mutex. This allows it enough
space to do its work in global memory. Although this is, of course,
slower, it should happen very infrequently. Parameters can be tuned to
determine that frequency. This commit also contains a few optimizations
to the code which reduce the running time on a μ = 200 event from about
1100 microseconds to 700 microseconds on an RTX A5000.
  • Loading branch information
stephenswat committed Jun 5, 2024
1 parent 15baf6d commit a84cfd8
Show file tree
Hide file tree
Showing 21 changed files with 242 additions and 94 deletions.
24 changes: 24 additions & 0 deletions core/include/traccc/clusterization/clustering_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

namespace traccc {

/// Configuration parameters for the (device) clusterization algorithms.
///
/// These values control how cells are grouped into partitions for the CCL
/// kernel, and how much global scratch memory is reserved as a fallback for
/// partitions too large to be processed in shared memory.
struct clustering_config {
    /// Construct a clustering configuration.
    ///
    /// @param _target_cells_per_partition Desired average number of cells
    ///        processed by one thread block
    /// @param _target_cells_per_thread Desired average number of cells
    ///        processed by one thread
    /// @param _backup_partition_size Size (in cells) of the global-memory
    ///        backup partition used when a partition exceeds the shared
    ///        memory capacity
    constexpr clustering_config(
        unsigned int _target_cells_per_partition = 2048,
        unsigned int _target_cells_per_thread = 8,
        unsigned int _backup_partition_size = 256 * 4096)
        : target_cells_per_partition(_target_cells_per_partition),
          target_cells_per_thread(_target_cells_per_thread),
          backup_partition_size(_backup_partition_size) {}

    /// Desired average number of cells per thread block
    unsigned int target_cells_per_partition;
    /// Desired average number of cells per thread
    unsigned int target_cells_per_thread;
    /// Size (in cells) of the global-memory backup partition
    unsigned int backup_partition_size;
};
}  // namespace traccc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#pragma once

// Library include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/measurement_creation_algorithm.hpp"
#include "traccc/clusterization/sparse_ccl_algorithm.hpp"
#include "traccc/edm/cell.hpp"
Expand All @@ -33,6 +34,8 @@ class clusterization_algorithm
const cell_module_collection_types::const_view&)> {

public:
using config_type = clustering_config;

/// Clusterization algorithm constructor
///
/// @param mr The memory resource to use for the result objects
Expand Down
46 changes: 29 additions & 17 deletions device/common/include/traccc/clusterization/device/ccl_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#pragma once

// Project include(s).
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/definitions/qualifiers.hpp"
#include "traccc/edm/cell.hpp"
#include "traccc/edm/measurement.hpp"
Expand All @@ -23,33 +24,29 @@
namespace traccc::device {

namespace details {

/// These indices in clusterization will only range from 0 to
/// max_cells_per_partition, so we only need a short
using index_t = unsigned short;

static constexpr int TARGET_CELLS_PER_THREAD = 8;
static constexpr int MAX_CELLS_PER_THREAD = 32;
static constexpr int MAX_CELLS_PER_THREAD = 16;

/// Helper struct for calculating some of the input parameters of @c ccl_kernel
struct ccl_kernel_helper {

/// Constructor setting the helper parameters
///
/// @param[in] target_cells_per_partition Target average number of cells per
/// thread block
/// thread block
/// @param[in] target_cells_per_thread Target average number of cells per
/// thread
/// @param[in] n_cells Total number of cells
///
ccl_kernel_helper(index_t target_cells_per_partition,
unsigned int n_cells) {

index_t target_cells_per_thread, unsigned int n_cells) {
/// Shared memory size
max_cells_per_partition =
(target_cells_per_partition * MAX_CELLS_PER_THREAD +
TARGET_CELLS_PER_THREAD - 1) /
TARGET_CELLS_PER_THREAD;
target_cells_per_thread - 1) /
target_cells_per_thread;
/// Block size
threads_per_partition =
(target_cells_per_partition + TARGET_CELLS_PER_THREAD - 1) /
TARGET_CELLS_PER_THREAD;
(target_cells_per_partition + target_cells_per_thread - 1) /
target_cells_per_thread;
/// Grid size
num_partitions = (n_cells + target_cells_per_partition - 1) /
target_cells_per_partition;
}
Expand Down Expand Up @@ -81,6 +78,16 @@ struct ccl_kernel_helper {
/// @param f_view array of "parent" indices for all cells in this partition
/// @param gf_view array of "grandparent" indices for all cells in this
/// partition
/// @param f_backup_view global memory alternative to `f_view` for cases in
/// which that array is not large enough
/// @param gf_backup_view global memory alternative to `gf_view` for cases in
/// which that array is not large enough
/// @param adjc_backup_view global memory alternative to the adjacent cell
/// count vector
/// @param adjv_backup_view global memory alternative to the cell adjacency
/// matrix fragment storage
/// @param backup_mutex mutex lock to mediate control over the backup global
/// memory data structures.
/// @param barrier A generic object for block-wide synchronisation
/// @param[out] measurements_view collection of measurements
/// @param[out] cell_links collection of links to measurements each cell is
Expand All @@ -94,7 +101,12 @@ TRACCC_DEVICE inline void ccl_kernel(
const details::index_t target_cells_per_partition,
unsigned int& partition_start, unsigned int& partition_end,
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

namespace traccc::device::details {
/// Cell index type used by the CCL (connected component labelling) device
/// code. These indices will only range from 0 to max_cells_per_partition,
/// so a 16-bit unsigned integer is sufficient.
using index_t = unsigned short;
} // namespace traccc::device::details
101 changes: 73 additions & 28 deletions device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "traccc/clusterization/device/aggregate_cluster.hpp"
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
#include "vecmem/memory/device_atomic_ref.hpp"

namespace traccc::device {

Expand All @@ -33,13 +34,13 @@ namespace traccc::device {
/// @param[in] barrier A generic object for block-wide synchronisation
///
template <typename barrier_t>
TRACCC_DEVICE void fast_sv_1(
vecmem::device_vector<details::index_t>& f,
vecmem::device_vector<details::index_t>& gf,
unsigned char adjc[details::MAX_CELLS_PER_THREAD],
details::index_t adjv[details::MAX_CELLS_PER_THREAD][8],
const details::index_t tid, const details::index_t blckDim,
barrier_t& barrier) {
TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
vecmem::device_vector<details::index_t>& gf,
unsigned char* adjc, details::index_t* adjv,
details::index_t thread_cell_count,
const details::index_t tid,
const details::index_t blckDim,
barrier_t& barrier) {
/*
* The algorithm finishes if an iteration leaves the arrays unchanged.
     * This variable will be set if a change is made, and dictates if another
Expand All @@ -61,13 +62,12 @@ TRACCC_DEVICE void fast_sv_1(
* cluster ID if it is lower than ours, essentially merging the two
* together.
*/
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + tid;

__builtin_assume(adjc[tst] <= 8);
for (unsigned char k = 0; k < adjc[tst]; ++k) {
details::index_t q = gf.at(adjv[tst][k]);
details::index_t q = gf.at(adjv[8 * tst + k]);

if (gf.at(cid) > q) {
f.at(f.at(cid)) = q;
Expand All @@ -83,8 +83,7 @@ TRACCC_DEVICE void fast_sv_1(
barrier.blockBarrier();

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + tid;
/*
* The second stage is shortcutting, which is an optimisation that
Expand All @@ -102,8 +101,7 @@ TRACCC_DEVICE void fast_sv_1(
barrier.blockBarrier();

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + tid;
/*
* Update the array for the next generation, keeping track of any
Expand Down Expand Up @@ -135,17 +133,24 @@ TRACCC_DEVICE inline void ccl_kernel(
const details::index_t target_cells_per_partition,
unsigned int& partition_start, unsigned int& partition_end,
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links) {

// Construct device containers around the views.
const cell_collection_types::const_device cells_device(cells_view);
const cell_module_collection_types::const_device modules_device(
modules_view);
measurement_collection_types::device measurements_device(measurements_view);
vecmem::device_vector<details::index_t> f(f_view);
vecmem::device_vector<details::index_t> gf(gf_view);
vecmem::device_vector<unsigned char> adjc_backup(adjc_backup_view);
vecmem::device_vector<details::index_t> adjv_backup(adjv_backup_view);
bool using_backup_memory = false;

const cell_collection_types::const_device::size_type num_cells =
cells_device.size();
Expand Down Expand Up @@ -199,41 +204,73 @@ TRACCC_DEVICE inline void ccl_kernel(
barrier.blockBarrier();

// Vector of indices of the adjacent cells
details::index_t adjv[details::MAX_CELLS_PER_THREAD][8];
details::index_t _adjv[details::MAX_CELLS_PER_THREAD * 8];
details::index_t* adjv = _adjv;

/*
* The number of adjacent cells for each cell must start at zero, to
* avoid uninitialized memory. adjv does not need to be zeroed, as
* we will only access those values if adjc indicates that the value
* is set.
*/
// Number of adjacent cells
unsigned char adjc[details::MAX_CELLS_PER_THREAD];
unsigned char _adjc[details::MAX_CELLS_PER_THREAD];
unsigned char* adjc = _adjc;

// It seems that sycl runs into undefined behaviour when calling
// group synchronisation functions when some threads have already run
// into a return. As such, we cannot use returns in this kernel.

// Get partition for this thread group
const details::index_t size = partition_end - partition_start;
assert(size <= max_cells_per_partition);

// If our partition is too large, we need to handle this specific edge
// case. The first thread of the block will attempt to enter a critical
// section by obtaining a lock on a mutex in global memory. When this is
// obtained, we can use some memory in global memory instead of the shared
// memory. This can be done more efficiently, but this should be a very
// rare edge case.
if (size > max_cells_per_partition) {
if (threadId == 0) {
uint32_t false_int = 0;
while (backup_mutex.compare_exchange_strong(false_int, 1u)) {
}
}

barrier.blockBarrier();

f = f_backup_view;
gf = gf_backup_view;
adjc = adjc_backup.data();
adjv = adjv_backup.data();
using_backup_memory = true;
}

assert(size <= f.size());
assert(size <= gf.size());

details::index_t thread_cell_count = 0;
for (details::index_t cid;
(cid = thread_cell_count * blckDim + threadId) < size;
++thread_cell_count) {
}

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
adjc[tst] = 0;
}

for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
/*
* Look for adjacent cells to the current one.
*/
assert(tst < details::MAX_CELLS_PER_THREAD);
const details::index_t cid = tst * blckDim + threadId;
reduce_problem_cell(cells_device, cid, partition_start, partition_end,
adjc[tst], adjv[tst]);
adjc[tst], &adjv[8 * tst]);
}

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + threadId;
/*
* At the start, the values of f and gf should be equal to the
Expand All @@ -253,12 +290,13 @@ TRACCC_DEVICE inline void ccl_kernel(
* Run FastSV algorithm, which will update the father index to that of
* the cell belonging to the same cluster with the lowest index.
*/
fast_sv_1(f, gf, adjc, adjv, threadId, blckDim, barrier);
fast_sv_1(f, gf, &adjc[0], &adjv[0], thread_cell_count, threadId, blckDim,
barrier);

barrier.blockBarrier();

for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + threadId;
if (f.at(cid) == cid) {
// Add a new measurement to the output buffer. Remembering its
// position inside of the container.
Expand All @@ -271,6 +309,13 @@ TRACCC_DEVICE inline void ccl_kernel(
meas_pos);
}
}

// Recall that we might be holding a mutex on some global memory. If we
// are, make sure to release it here so that any future kernels trying to
// use that memory don't get stuck in a loop forever.
if (threadId == 0 && using_backup_memory) {
backup_mutex.store(0);
}
}

} // namespace traccc::device
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ TRACCC_HOST_DEVICE
inline void reduce_problem_cell(
const cell_collection_types::const_device& cells, const unsigned short cid,
const unsigned int start, const unsigned int end, unsigned char& adjc,
unsigned short adjv[8]) {
unsigned short* adjv) {

// Some sanity check(s).
assert(start <= end);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ TRACCC_HOST_DEVICE
inline void reduce_problem_cell(
const cell_collection_types::const_device& cells, unsigned short cid,
unsigned int start, unsigned int end, unsigned char& adjc,
unsigned short adjv[8]);
unsigned short* adjv);

} // namespace traccc::device

Expand Down
Loading

0 comments on commit a84cfd8

Please sign in to comment.