Skip to content

Commit

Permalink
Improve robustness and performance of CCL
Browse files Browse the repository at this point in the history
This commit partially addresses acts-project#567. In the past, the CCL kernel was
unable to deal with extremely large partitions. Although this is very
unlikely to happen, our ODD samples contain a few cases of partitions so
large that they crash the code. This commit equips the CCL code with some
scratch memory which it can reserve using a mutex. This allows it enough
space to do its work in global memory. Although this is, of course,
slower, it should happen very infrequently. Parameters can be tuned to
determine that frequency. This commit also contains a few optimizations
to the code which reduce the running time on a μ = 200 event from about
1100 microseconds to 700 microseconds on an RTX A5000.
  • Loading branch information
stephenswat committed Jun 5, 2024
1 parent 15baf6d commit a84cfd8
Show file tree
Hide file tree
Showing 21 changed files with 242 additions and 94 deletions.
24 changes: 24 additions & 0 deletions core/include/traccc/clusterization/clustering_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

namespace traccc {

/// Configuration parameters for the (device) clusterization algorithms.
///
/// These values control how cells are grouped into partitions for the CCL
/// kernel, and how much global scratch memory is reserved as a fallback for
/// partitions too large to be processed in shared memory.
struct clustering_config {
    /// Construct a clustering configuration.
    ///
    /// @param _target_cells_per_partition Desired average number of cells
    ///        processed by one thread block
    /// @param _target_cells_per_thread Desired average number of cells
    ///        processed by one thread
    /// @param _backup_partition_size Size (in cells) of the global-memory
    ///        backup partition used when a partition exceeds the shared
    ///        memory capacity
    constexpr clustering_config(
        unsigned int _target_cells_per_partition = 2048,
        unsigned int _target_cells_per_thread = 8,
        unsigned int _backup_partition_size = 256 * 4096)
        : target_cells_per_partition(_target_cells_per_partition),
          target_cells_per_thread(_target_cells_per_thread),
          backup_partition_size(_backup_partition_size) {}

    /// Desired average number of cells per thread block
    unsigned int target_cells_per_partition;
    /// Desired average number of cells per thread
    unsigned int target_cells_per_thread;
    /// Size (in cells) of the global-memory backup partition
    unsigned int backup_partition_size;
};
}  // namespace traccc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#pragma once

// Library include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/measurement_creation_algorithm.hpp"
#include "traccc/clusterization/sparse_ccl_algorithm.hpp"
#include "traccc/edm/cell.hpp"
Expand All @@ -33,6 +34,8 @@ class clusterization_algorithm
const cell_module_collection_types::const_view&)> {

public:
using config_type = clustering_config;

/// Clusterization algorithm constructor
///
/// @param mr The memory resource to use for the result objects
Expand Down
46 changes: 29 additions & 17 deletions device/common/include/traccc/clusterization/device/ccl_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#pragma once

// Project include(s).
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/definitions/qualifiers.hpp"
#include "traccc/edm/cell.hpp"
#include "traccc/edm/measurement.hpp"
Expand All @@ -23,33 +24,29 @@
namespace traccc::device {

namespace details {

/// These indices in clusterization will only range from 0 to
/// max_cells_per_partition, so we only need a short
using index_t = unsigned short;

static constexpr int TARGET_CELLS_PER_THREAD = 8;
static constexpr int MAX_CELLS_PER_THREAD = 32;
static constexpr int MAX_CELLS_PER_THREAD = 16;

/// Helper struct for calculating some of the input parameters of @c ccl_kernel
struct ccl_kernel_helper {

/// Constructor setting the helper parameters
///
/// @param[in] target_cells_per_partition Target average number of cells per
/// thread block
/// thread block
/// @param[in] target_cells_per_thread Target average number of cells per
/// thread
/// @param[in] n_cells Total number of cells
///
ccl_kernel_helper(index_t target_cells_per_partition,
unsigned int n_cells) {

index_t target_cells_per_thread, unsigned int n_cells) {
/// Shared memory size
max_cells_per_partition =
(target_cells_per_partition * MAX_CELLS_PER_THREAD +
TARGET_CELLS_PER_THREAD - 1) /
TARGET_CELLS_PER_THREAD;
target_cells_per_thread - 1) /
target_cells_per_thread;
/// Block size
threads_per_partition =
(target_cells_per_partition + TARGET_CELLS_PER_THREAD - 1) /
TARGET_CELLS_PER_THREAD;
(target_cells_per_partition + target_cells_per_thread - 1) /
target_cells_per_thread;
/// Grid size
num_partitions = (n_cells + target_cells_per_partition - 1) /
target_cells_per_partition;
}
Expand Down Expand Up @@ -81,6 +78,16 @@ struct ccl_kernel_helper {
/// @param f_view array of "parent" indices for all cells in this partition
/// @param gf_view array of "grandparent" indices for all cells in this
/// partition
/// @param f_backup_view global memory alternative to `f_view` for cases in
/// which that array is not large enough
/// @param gf_backup_view global memory alternative to `gf_view` for cases in
/// which that array is not large enough
/// @param adjc_backup_view global memory alternative to the adjacent cell
/// count vector
/// @param adjv_backup_view global memory alternative to the cell adjacency
/// matrix fragment storage
/// @param backup_mutex mutex lock to mediate control over the backup global
/// memory data structures.
/// @param barrier A generic object for block-wide synchronisation
/// @param[out] measurements_view collection of measurements
/// @param[out] cell_links collection of links to measurements each cell is
Expand All @@ -94,7 +101,12 @@ TRACCC_DEVICE inline void ccl_kernel(
const details::index_t target_cells_per_partition,
unsigned int& partition_start, unsigned int& partition_end,
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

namespace traccc::device::details {
/// Cell index type used by the CCL (connected component labelling) device
/// code. These indices will only range from 0 to max_cells_per_partition,
/// so a 16-bit unsigned integer is sufficient.
using index_t = unsigned short;
} // namespace traccc::device::details
101 changes: 73 additions & 28 deletions device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "traccc/clusterization/device/aggregate_cluster.hpp"
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
#include "vecmem/memory/device_atomic_ref.hpp"

namespace traccc::device {

Expand All @@ -33,13 +34,13 @@ namespace traccc::device {
/// @param[in] barrier A generic object for block-wide synchronisation
///
template <typename barrier_t>
TRACCC_DEVICE void fast_sv_1(
vecmem::device_vector<details::index_t>& f,
vecmem::device_vector<details::index_t>& gf,
unsigned char adjc[details::MAX_CELLS_PER_THREAD],
details::index_t adjv[details::MAX_CELLS_PER_THREAD][8],
const details::index_t tid, const details::index_t blckDim,
barrier_t& barrier) {
TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
vecmem::device_vector<details::index_t>& gf,
unsigned char* adjc, details::index_t* adjv,
details::index_t thread_cell_count,
const details::index_t tid,
const details::index_t blckDim,
barrier_t& barrier) {
/*
* The algorithm finishes if an iteration leaves the arrays unchanged.
     * This variable will be set if a change is made, and dictates if another
Expand All @@ -61,13 +62,12 @@ TRACCC_DEVICE void fast_sv_1(
* cluster ID if it is lower than ours, essentially merging the two
* together.
*/
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + tid;

__builtin_assume(adjc[tst] <= 8);
for (unsigned char k = 0; k < adjc[tst]; ++k) {
details::index_t q = gf.at(adjv[tst][k]);
details::index_t q = gf.at(adjv[8 * tst + k]);

if (gf.at(cid) > q) {
f.at(f.at(cid)) = q;
Expand All @@ -83,8 +83,7 @@ TRACCC_DEVICE void fast_sv_1(
barrier.blockBarrier();

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + tid;
/*
* The second stage is shortcutting, which is an optimisation that
Expand All @@ -102,8 +101,7 @@ TRACCC_DEVICE void fast_sv_1(
barrier.blockBarrier();

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + tid;
/*
* Update the array for the next generation, keeping track of any
Expand Down Expand Up @@ -135,17 +133,24 @@ TRACCC_DEVICE inline void ccl_kernel(
const details::index_t target_cells_per_partition,
unsigned int& partition_start, unsigned int& partition_end,
unsigned int& outi, vecmem::data::vector_view<details::index_t> f_view,
vecmem::data::vector_view<details::index_t> gf_view, barrier_t& barrier,
vecmem::data::vector_view<details::index_t> gf_view,
vecmem::data::vector_view<details::index_t> f_backup_view,
vecmem::data::vector_view<details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links) {

// Construct device containers around the views.
const cell_collection_types::const_device cells_device(cells_view);
const cell_module_collection_types::const_device modules_device(
modules_view);
measurement_collection_types::device measurements_device(measurements_view);
vecmem::device_vector<details::index_t> f(f_view);
vecmem::device_vector<details::index_t> gf(gf_view);
vecmem::device_vector<unsigned char> adjc_backup(adjc_backup_view);
vecmem::device_vector<details::index_t> adjv_backup(adjv_backup_view);
bool using_backup_memory = false;

const cell_collection_types::const_device::size_type num_cells =
cells_device.size();
Expand Down Expand Up @@ -199,41 +204,73 @@ TRACCC_DEVICE inline void ccl_kernel(
barrier.blockBarrier();

// Vector of indices of the adjacent cells
details::index_t adjv[details::MAX_CELLS_PER_THREAD][8];
details::index_t _adjv[details::MAX_CELLS_PER_THREAD * 8];
details::index_t* adjv = _adjv;

/*
* The number of adjacent cells for each cell must start at zero, to
* avoid uninitialized memory. adjv does not need to be zeroed, as
* we will only access those values if adjc indicates that the value
* is set.
*/
// Number of adjacent cells
unsigned char adjc[details::MAX_CELLS_PER_THREAD];
unsigned char _adjc[details::MAX_CELLS_PER_THREAD];
unsigned char* adjc = _adjc;

// It seems that sycl runs into undefined behaviour when calling
// group synchronisation functions when some threads have already run
// into a return. As such, we cannot use returns in this kernel.

// Get partition for this thread group
const details::index_t size = partition_end - partition_start;
assert(size <= max_cells_per_partition);

// If our partition is too large, we need to handle this specific edge
// case. The first thread of the block will attempt to enter a critical
// section by obtaining a lock on a mutex in global memory. When this is
// obtained, we can use some memory in global memory instead of the shared
// memory. This can be done more efficiently, but this should be a very
// rare edge case.
if (size > max_cells_per_partition) {
if (threadId == 0) {
uint32_t false_int = 0;
while (backup_mutex.compare_exchange_strong(false_int, 1u)) {
}
}

barrier.blockBarrier();

f = f_backup_view;
gf = gf_backup_view;
adjc = adjc_backup.data();
adjv = adjv_backup.data();
using_backup_memory = true;
}

assert(size <= f.size());
assert(size <= gf.size());

details::index_t thread_cell_count = 0;
for (details::index_t cid;
(cid = thread_cell_count * blckDim + threadId) < size;
++thread_cell_count) {
}

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
adjc[tst] = 0;
}

for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
/*
* Look for adjacent cells to the current one.
*/
assert(tst < details::MAX_CELLS_PER_THREAD);
const details::index_t cid = tst * blckDim + threadId;
reduce_problem_cell(cells_device, cid, partition_start, partition_end,
adjc[tst], adjv[tst]);
adjc[tst], &adjv[8 * tst]);
}

#pragma unroll
for (details::index_t tst = 0; tst < details::MAX_CELLS_PER_THREAD; ++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + threadId;
/*
* At the start, the values of f and gf should be equal to the
Expand All @@ -253,12 +290,13 @@ TRACCC_DEVICE inline void ccl_kernel(
* Run FastSV algorithm, which will update the father index to that of
* the cell belonging to the same cluster with the lowest index.
*/
fast_sv_1(f, gf, adjc, adjv, threadId, blckDim, barrier);
fast_sv_1(f, gf, &adjc[0], &adjv[0], thread_cell_count, threadId, blckDim,
barrier);

barrier.blockBarrier();

for (details::index_t tst = 0, cid; (cid = tst * blckDim + threadId) < size;
++tst) {
for (details::index_t tst = 0; tst < thread_cell_count; ++tst) {
const details::index_t cid = tst * blckDim + threadId;
if (f.at(cid) == cid) {
// Add a new measurement to the output buffer. Remembering its
// position inside of the container.
Expand All @@ -271,6 +309,13 @@ TRACCC_DEVICE inline void ccl_kernel(
meas_pos);
}
}

// Recall that we might be holding a mutex on some global memory. If we
// are, make sure to release it here so that any future kernels trying to
// use that memory don't get stuck in a loop forever.
if (threadId == 0 && using_backup_memory) {
backup_mutex.store(0);
}
}

} // namespace traccc::device
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ TRACCC_HOST_DEVICE
inline void reduce_problem_cell(
const cell_collection_types::const_device& cells, const unsigned short cid,
const unsigned int start, const unsigned int end, unsigned char& adjc,
unsigned short adjv[8]) {
unsigned short* adjv) {

// Some sanity check(s).
assert(start <= end);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ TRACCC_HOST_DEVICE
inline void reduce_problem_cell(
const cell_collection_types::const_device& cells, unsigned short cid,
unsigned int start, unsigned int end, unsigned char& adjc,
unsigned short adjv[8]);
unsigned short* adjv);

} // namespace traccc::device

Expand Down
Loading

0 comments on commit a84cfd8

Please sign in to comment.