Skip to content

Commit

Permalink
Improve robustness and performance of CCL
Browse files Browse the repository at this point in the history
This commit partially addresses acts-project#567. In the past, the CCL kernel was
unable to deal with extremely large partitions. Although this is very
unlikely to happen, our ODD samples contain a few cases of partitions so
large that they crash the code. This commit equips the CCL code with some
scratch memory which it can reserve using a mutex. This allows it enough
space to do its work in global memory. Although this is, of course,
slower, it should happen very infrequently. Parameters can be tuned to
determine that frequency. This commit also contains a few optimizations
to the code which reduce the running time on a μ = 200 event from about
1100 microseconds to 700 microseconds on an RTX A5000.
  • Loading branch information
stephenswat committed Jun 28, 2024
1 parent 0ef502d commit b2874be
Show file tree
Hide file tree
Showing 28 changed files with 422 additions and 212 deletions.
32 changes: 32 additions & 0 deletions core/include/traccc/clusterization/clustering_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

#include <cstddef>
#include <cstdint>

namespace traccc {
/// Tunable parameters of the clusterization partitioning.
///
/// The derived sizes below are computed in @c std::size_t so that large
/// configurations do not wrap around in 32-bit arithmetic before the result
/// is widened.
struct clustering_config {
    /// Number of threads assigned to one partition
    unsigned int threads_per_partition;
    /// Maximum number of cells per thread
    unsigned int max_cells_per_thread;
    /// Target number of cells per thread
    unsigned int target_cells_per_thread;
    /// Factor applied to @c max_partition_size() to obtain @c backup_size()
    unsigned int backup_size_multiplier;

    /// @return @c threads_per_partition * @c max_cells_per_thread
    constexpr std::size_t max_partition_size() const {
        // Widen first: the product of two `unsigned int` values would
        // otherwise be evaluated modulo 2^32 and only then converted.
        return static_cast<std::size_t>(threads_per_partition) *
               max_cells_per_thread;
    }

    /// @return @c threads_per_partition * @c target_cells_per_thread
    constexpr std::size_t target_partition_size() const {
        // Same widening as in max_partition_size().
        return static_cast<std::size_t>(threads_per_partition) *
               target_cells_per_thread;
    }

    /// @return @c max_partition_size() * @c backup_size_multiplier
    constexpr std::size_t backup_size() const {
        return max_partition_size() * backup_size_multiplier;
    }
};
}  // namespace traccc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#pragma once

// Library include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/measurement_creation_algorithm.hpp"
#include "traccc/clusterization/sparse_ccl_algorithm.hpp"
#include "traccc/edm/cell.hpp"
Expand All @@ -19,6 +20,7 @@

// System include(s).
#include <functional>
#include <variant>

namespace traccc::host {

Expand All @@ -33,6 +35,8 @@ class clusterization_algorithm
const cell_module_collection_types::const_view&)> {

public:
using config_type = std::monostate;

/// Clusterization algorithm constructor
///
/// @param mr The memory resource to use for the result objects
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ TRACCC_HOST_DEVICE
inline void aggregate_cluster(
const cell_collection_types::const_device& cells,
const cell_module_collection_types::const_device& modules,
const vecmem::data::vector_view<const unsigned short>& f_view,
unsigned int start, unsigned int end, unsigned short cid, measurement& out,
const vecmem::device_vector<const details::index_t>& f, unsigned int start,
unsigned int end, unsigned short cid, measurement& out,
vecmem::data::vector_view<unsigned int> cell_links, unsigned int link);

} // namespace traccc::device
Expand Down
75 changes: 23 additions & 52 deletions device/common/include/traccc/clusterization/device/ccl_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#pragma once

// Project include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/definitions/qualifiers.hpp"
#include "traccc/edm/cell.hpp"
#include "traccc/edm/measurement.hpp"
Expand All @@ -22,79 +24,48 @@

namespace traccc::device {

namespace details {

/// Index type used in clusterization. Values only span the range from 0 to
/// max_cells_per_partition, which is why a short is enough.
using index_t = unsigned short;

static constexpr int TARGET_CELLS_PER_THREAD = 8;
static constexpr int MAX_CELLS_PER_THREAD = 32;

/// Small helper that works out some of the inputs of @c ccl_kernel
struct ccl_kernel_helper {

    /// Compute all helper parameters from the partition target and cell count
    ///
    /// Every quantity is a rounded-up integer division.
    ///
    /// @param[in] target_cells_per_partition Target average number of cells per
    ///                                       thread block
    /// @param[in] n_cells Total number of cells
    ///
    ccl_kernel_helper(index_t target_cells_per_partition,
                      unsigned int n_cells)
        : max_cells_per_partition(static_cast<index_t>(
              (target_cells_per_partition * MAX_CELLS_PER_THREAD +
               TARGET_CELLS_PER_THREAD - 1) /
              TARGET_CELLS_PER_THREAD)),
          threads_per_partition(static_cast<unsigned int>(
              (target_cells_per_partition + TARGET_CELLS_PER_THREAD - 1) /
              TARGET_CELLS_PER_THREAD)),
          num_partitions((n_cells + target_cells_per_partition - 1) /
                         target_cells_per_partition) {}

    /// Upper limit on the cells in one partition
    index_t max_cells_per_partition;
    /// Thread count used for one partition
    unsigned int threads_per_partition;
    /// Total partition count
    unsigned int num_partitions;

};  // struct ccl_kernel_helper

}  // namespace details

/// Function which reads raw detector cells and turns them into measurements.
///
/// @param[in] cfg clustering configuration
/// @param[in] threadId current thread index
/// @param[in] blckDim current thread block size
/// @param[in] blckId current thread block index
/// @param[in] cells_view collection of cells
/// @param[in] modules_view collection of modules to which the cells are linked
/// @param[in] max_cells_per_partition maximum number of cells per thread block
/// @param[in] target_cells_per_partition average number of cells per thread
/// block
/// @param partition_start partition start point for this thread block
/// @param partition_end partition end point for this thread block
/// @param outi number of measurements for this partition
/// @param f_view array of "parent" indices for all cells in this partition
/// @param gf_view array of "grandparent" indices for all cells in this
/// partition
/// @param f_backup_view global memory alternative to `f_view` for cases in
/// which that array is not large enough
/// @param gf_backup_view global memory alternative to `gf_view` for cases in
/// which that array is not large enough
/// @param adjc_backup_view global memory alternative to the adjacent cell
/// count vector
/// @param adjv_backup_view global memory alternative to the cell adjacency
/// matrix fragment storage
/// @param backup_mutex mutex lock to mediate control over the backup global
/// memory data structures.
/// @param barrier A generic object for block-wide synchronisation
/// @param[out] measurements_view collection of measurements
/// @param[out] cell_links collection of links to measurements each cell is
/// put into
template <typename barrier_t>
TRACCC_DEVICE inline void ccl_kernel(
details::index_t threadId, details::index_t blckDim, unsigned int blockId,
const clustering_config cfg, details::index_t threadId,
details::index_t blckDim, unsigned int blockId,
const cell_collection_types::const_view cells_view,
const cell_module_collection_types::const_view modules_view,
const details::index_t max_cells_per_partition,
const details::index_t target_cells_per_partition,
unsigned int& partition_start, unsigned int& partition_end,
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

namespace traccc::device::details {
/// Stack size limit, expressed as a number of cells per thread.
static constexpr std::size_t CELLS_PER_THREAD_STACK_LIMIT = 32;

/// Index type used in clusterization. Values only span the range from 0 to
/// max_cells_per_partition, which is why a short is enough.
using index_t = unsigned short;
}  // namespace traccc::device::details
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,10 @@ TRACCC_HOST_DEVICE
inline void aggregate_cluster(
const cell_collection_types::const_device& cells,
const cell_module_collection_types::const_device& modules,
const vecmem::data::vector_view<const unsigned short>& f_view,
const vecmem::device_vector<const details::index_t>& f,
const unsigned int start, const unsigned int end, const unsigned short cid,
measurement& out, vecmem::data::vector_view<unsigned int> cell_links,
const unsigned int link) {

const vecmem::device_vector<const unsigned short> f(f_view);
vecmem::device_vector<unsigned int> cell_links_device(cell_links);

/*
Expand Down
Loading

0 comments on commit b2874be

Please sign in to comment.