Skip to content

Commit

Permalink
Improve robustness and performance of CCL
Browse files Browse the repository at this point in the history
This commit partially addresses acts-project#567. In the past, the CCL kernel was
unable to deal with extremely large partitions. Although this is very
unlikely to happen, our ODD samples contain a few partitions that are so
large that they crash the code. This commit equips the CCL code with some
scratch memory which it can reserve using a mutex. This allows it enough
space to do its work in global memory. Although this is, of course,
slower, it should happen very infrequently. Parameters can be tuned to
determine that frequency. This commit also contains a few optimizations
to the code which reduce the running time on a μ = 200 event from about
1100 microseconds to 700 microseconds on an RTX A5000.
  • Loading branch information
stephenswat committed Jul 10, 2024
1 parent a9f2e8c commit 87c8735
Show file tree
Hide file tree
Showing 28 changed files with 527 additions and 245 deletions.
78 changes: 78 additions & 0 deletions core/include/traccc/clusterization/clustering_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

#include <cstddef>
#include <cstdint>

#include "traccc/definitions/qualifiers.hpp"

namespace traccc {
/**
* @brief Configuration type for massively parallel clustering algorithms.
*/
struct clustering_config {
    /**
     * @brief The desired number of threads per partition.
     *
     * This directly correlates to the block size on most algorithms, so don't
     * set this too low (which will reduce occupancy due to available thread
     * slots) or too high (which may not be supported on a device).
     */
    unsigned int threads_per_partition;

    /**
     * @brief The maximum number of cells per thread.
     *
     * This sets the maximum thread coarsening factor for the CCA algorithm.
     * Increasing this value increases shared memory usage and may decrease
     * occupancy. If this is too low, scratch space will need to be used which
     * may slow the algorithm down.
     */
    unsigned int max_cells_per_thread;

    /**
     * @brief The desired number of cells per thread.
     *
     * This sets the desired thread coarsening factor for the CCA algorithm.
     * Decreasing this may decrease occupancy. Increasing this increases the
     * probability that scratch space will need to be used.
     */
    unsigned int target_cells_per_thread;

    /**
     * @brief The upscaling factor for the scratch space.
     *
     * The scratch space will be large enough to support partitions this number
     * of times larger than the maximum partition size determined by
     * `threads_per_partition` and `max_cells_per_thread`.
     */
    unsigned int backup_size_multiplier;

    /**
     * @brief The maximum number of cells per partition.
     *
     * @return `threads_per_partition * max_cells_per_thread`, evaluated in
     *         `std::size_t` arithmetic.
     */
    TRACCC_HOST_DEVICE constexpr std::size_t max_partition_size() const {
        // Widen *before* multiplying: the product of two `unsigned int`
        // values would otherwise wrap modulo 2^32 and only then be converted
        // to the (typically wider) return type.
        return static_cast<std::size_t>(threads_per_partition) *
               max_cells_per_thread;
    }

    /**
     * @brief The target number of cells per partition.
     *
     * @return `threads_per_partition * target_cells_per_thread`, evaluated in
     *         `std::size_t` arithmetic.
     */
    TRACCC_HOST_DEVICE constexpr std::size_t target_partition_size() const {
        // Same widening as in max_partition_size() to avoid 32-bit wrap.
        return static_cast<std::size_t>(threads_per_partition) *
               target_cells_per_thread;
    }

    /**
     * @brief The total size of the scratch space, in number of cells.
     *
     * @return `max_partition_size() * backup_size_multiplier`.
     */
    TRACCC_HOST_DEVICE constexpr std::size_t backup_size() const {
        // max_partition_size() already yields std::size_t, so the multiplier
        // is converted to std::size_t before this multiplication.
        return max_partition_size() * backup_size_multiplier;
    }
};
} // namespace traccc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#pragma once

// Library include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/measurement_creation_algorithm.hpp"
#include "traccc/clusterization/sparse_ccl_algorithm.hpp"
#include "traccc/edm/cell.hpp"
Expand All @@ -19,6 +20,7 @@

// System include(s).
#include <functional>
#include <variant>

namespace traccc::host {

Expand All @@ -33,6 +35,8 @@ class clusterization_algorithm
const cell_module_collection_types::const_view&)> {

public:
using config_type = std::monostate;

/// Clusterization algorithm constructor
///
/// @param mr The memory resource to use for the result objects
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ TRACCC_HOST_DEVICE
inline void aggregate_cluster(
const cell_collection_types::const_device& cells,
const cell_module_collection_types::const_device& modules,
const vecmem::data::vector_view<const unsigned short>& f_view,
unsigned int start, unsigned int end, unsigned short cid, measurement& out,
const vecmem::device_vector<details::index_t>& f_view, unsigned int start,
unsigned int end, unsigned short cid, measurement& out,
vecmem::data::vector_view<unsigned int> cell_links, unsigned int link);

} // namespace traccc::device
Expand Down
75 changes: 23 additions & 52 deletions device/common/include/traccc/clusterization/device/ccl_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#pragma once

// Project include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/definitions/hints.hpp"
#include "traccc/definitions/qualifiers.hpp"
#include "traccc/device/concepts/barrier.hpp"
Expand All @@ -24,79 +26,48 @@

namespace traccc::device {

namespace details {

/// Index type used for cell positions inside a single partition. Such
/// positions lie between 0 and max_cells_per_partition, so a short suffices.
using index_t = unsigned short;

/// Number of cells each thread is expected to process on average.
static constexpr int TARGET_CELLS_PER_THREAD = 8;
/// Upper bound on the number of cells a single thread may process.
static constexpr int MAX_CELLS_PER_THREAD = 32;

/// Derives the launch parameters of @c ccl_kernel from a target partition
/// size and the total number of cells.
struct ccl_kernel_helper {

    /// Computes all helper parameters.
    ///
    /// Every quantity is a rounded-up integer division:
    ///  - max_cells_per_partition scales the target by
    ///    MAX_CELLS_PER_THREAD / TARGET_CELLS_PER_THREAD,
    ///  - threads_per_partition is the target divided by
    ///    TARGET_CELLS_PER_THREAD,
    ///  - num_partitions is n_cells divided by the target.
    ///
    /// @param[in] target_cells_per_partition Target average number of cells
    ///                                       per thread block
    /// @param[in] n_cells Total number of cells
    ///
    ccl_kernel_helper(index_t target_cells_per_partition,
                      unsigned int n_cells)
        // NOTE(review): the intermediate value is an int and is narrowed to
        // index_t here; with the constants above it equals four times the
        // target, so targets above 16383 do not fit - confirm callers stay
        // below that.
        : max_cells_per_partition(static_cast<index_t>(
              (target_cells_per_partition * MAX_CELLS_PER_THREAD +
               TARGET_CELLS_PER_THREAD - 1) /
              TARGET_CELLS_PER_THREAD)),
          threads_per_partition(static_cast<unsigned int>(
              (target_cells_per_partition + TARGET_CELLS_PER_THREAD - 1) /
              TARGET_CELLS_PER_THREAD)),
          num_partitions((n_cells + target_cells_per_partition - 1) /
                         target_cells_per_partition) {}

    /// Maximum number of cells per partition
    index_t max_cells_per_partition;
    /// Number of threads per partition
    unsigned int threads_per_partition;
    /// Number of partitions
    unsigned int num_partitions;

};  // struct ccl_kernel_helper

}  // namespace details

/// Function which reads raw detector cells and turns them into measurements.
///
/// @param[in] cfg clustering configuration
/// @param[in] threadId current thread index
/// @param[in] blckDim current thread block size
/// @param[in] blockId current thread block index
/// @param[in] cells_view collection of cells
/// @param[in] modules_view collection of modules to which the cells are linked
/// @param[in] max_cells_per_partition maximum number of cells per thread block
/// @param[in] target_cells_per_partition average number of cells per thread
/// block
/// @param partition_start partition start point for this thread block
/// @param partition_end partition end point for this thread block
/// @param outi number of measurements for this partition
/// @param f_view array of "parent" indices for all cells in this partition
/// @param gf_view array of "grandparent" indices for all cells in this
/// partition
/// @param f_backup_view global memory alternative to `f_view` for cases in
/// which that array is not large enough
/// @param gf_backup_view global memory alternative to `gf_view` for cases in
/// which that array is not large enough
/// @param adjc_backup_view global memory alternative to the adjacent cell
/// count vector
/// @param adjv_backup_view global memory alternative to the cell adjacency
/// matrix fragment storage
/// @param backup_mutex mutex lock to mediate control over the backup global
/// memory data structures.
/// @param barrier A generic object for block-wide synchronisation
/// @param[out] measurements_view collection of measurements
/// @param[out] cell_links collection of links to measurements each cell is
/// put into
template <TRACCC_CONSTRAINT(device::concepts::barrier) barrier_t>
TRACCC_DEVICE inline void ccl_kernel(
details::index_t threadId, details::index_t blckDim, unsigned int blockId,
const clustering_config cfg, details::index_t threadId,
details::index_t blckDim, unsigned int blockId,
const cell_collection_types::const_view cells_view,
const cell_module_collection_types::const_view modules_view,
const details::index_t max_cells_per_partition,
const details::index_t target_cells_per_partition,
unsigned int& partition_start, unsigned int& partition_end,
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
std::size_t& partition_start, std::size_t& partition_end, std::size_t& outi,
vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

namespace traccc::device::details {
/// Upper bound, expressed in cells per thread, on what may be kept on the
/// stack.
/// NOTE(review): std::size_t is declared in <cstddef>, which this header does
/// not include itself - it currently relies on the including file.
static constexpr std::size_t CELLS_PER_THREAD_STACK_LIMIT{32};

/// Index type used by the clusterization code. The indices only range from 0
/// to max_cells_per_partition, so a short is sufficient.
using index_t = unsigned short;
}  // namespace traccc::device::details
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,10 @@ TRACCC_HOST_DEVICE
inline void aggregate_cluster(
const cell_collection_types::const_device& cells,
const cell_module_collection_types::const_device& modules,
const vecmem::data::vector_view<const unsigned short>& f_view,
const unsigned int start, const unsigned int end, const unsigned short cid,
measurement& out, vecmem::data::vector_view<unsigned int> cell_links,
const vecmem::device_vector<details::index_t>& f, const unsigned int start,
const unsigned int end, const unsigned short cid, measurement& out,
vecmem::data::vector_view<unsigned int> cell_links,
const unsigned int link) {

const vecmem::device_vector<const unsigned short> f(f_view);
vecmem::device_vector<unsigned int> cell_links_device(cell_links);

/*
Expand Down
Loading

0 comments on commit 87c8735

Please sign in to comment.