Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add debug output to clustering algorithms #640

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions core/include/traccc/clusterization/clustering_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ struct clustering_config {
*/
unsigned int backup_size_multiplier;

/**
* @brief Flag to enforce debug output.
*
* @warning This will slown down the clustering algorithm.
*/
bool enable_debug_output;

/**
* @brief The maximum number of cells per partition.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct CCLKernel {
partition_start, partition_end, outi, f_view,
gf_view, f_backup_view, gf_backup_view,
adjc_backup_view, adjv_backup_view, backup_mutex,
barry_r, measurements_view, cell_links);
barry_r, measurements_view, cell_links, nullptr);
}
};

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

#include <cstdint>

namespace traccc::device::details {
struct ccl_debug_output {
uint32_t num_oversized_partitions;

static ccl_debug_output init() {
ccl_debug_output rv;

rv.num_oversized_partitions = 0;

return rv;
}
};
} // namespace traccc::device::details
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

// Project include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/definitions/hints.hpp"
#include "traccc/definitions/qualifiers.hpp"
Expand Down Expand Up @@ -53,6 +54,7 @@ namespace traccc::device {
/// @param[out] measurements_view collection of measurements
/// @param[out] cell_links collection of links to measurements each cell is
/// put into
/// @param[out] debug_output debug output location
template <device::concepts::barrier barrier_t,
device::concepts::thread_id1 thread_id_t>
TRACCC_DEVICE inline void ccl_kernel(
Expand All @@ -68,7 +70,8 @@ TRACCC_DEVICE inline void ccl_kernel(
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links);
vecmem::data::vector_view<unsigned int> cell_links,
details::ccl_debug_output* debug_output);

} // namespace traccc::device

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/aggregate_cluster.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
#include "traccc/device/concepts/barrier.hpp"
Expand Down Expand Up @@ -220,7 +221,8 @@ TRACCC_DEVICE inline void ccl_kernel(
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links) {
vecmem::data::vector_view<unsigned int> cell_links,
details::ccl_debug_output* debug_output) {
// Construct device containers around the views.
const cell_collection_types::const_device cells_device(cells_view);
const cell_module_collection_types::const_device modules_device(
Expand Down Expand Up @@ -325,6 +327,13 @@ TRACCC_DEVICE inline void ccl_kernel(
if (size > cfg.max_partition_size()) {
if (thread_id.getLocalThreadIdX() == 0) {
lock.lock();

if (debug_output) {
vecmem::device_atomic_ref<uint32_t>
num_oversized_partitions_atm(
debug_output->num_oversized_partitions);
num_oversized_partitions_atm.fetch_add(1);
}
}

barrier.blockBarrier();
Expand Down
56 changes: 51 additions & 5 deletions device/cuda/src/clusterization/clusterization_algorithm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
*/

// CUDA Library include(s).
#include <cuda_runtime_api.h>
#include <driver_types.h>

#include "../sanity/contiguous_on.cuh"
#include "../sanity/ordered_on.cuh"
#include "../utils/barrier.hpp"
#include "../utils/cuda_error_handling.hpp"
#include "../utils/utils.hpp"
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/cuda/clusterization/clusterization_algorithm.hpp"
#include "traccc/cuda/utils/thread_id.hpp"
Expand All @@ -21,6 +25,9 @@
// Project include(s)
#include "traccc/clusterization/device/ccl_kernel.hpp"

// System include
#include <iostream>

// Vecmem include(s).
#include <cstring>
#include <vecmem/utils/copy.hpp>
Expand All @@ -40,7 +47,8 @@ __global__ void ccl_kernel(
vecmem::data::vector_view<device::details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
unsigned int* backup_mutex_ptr) {
unsigned int* backup_mutex_ptr,
device::details::ccl_debug_output* debug_output) {

__shared__ std::size_t partition_start, partition_end;
__shared__ std::size_t outi;
Expand All @@ -62,7 +70,7 @@ __global__ void ccl_kernel(
partition_start, partition_end, outi, f_view, gf_view,
f_backup_view, gf_backup_view, adjc_backup_view,
adjv_backup_view, backup_mutex, barry_r,
measurements_view, cell_links);
measurements_view, cell_links, debug_output);
}

} // namespace kernels
Expand Down Expand Up @@ -132,14 +140,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
assert(m_config.max_cells_per_thread <=
device::details::CELLS_PER_THREAD_STACK_LIMIT);

// If necessary, allocate an object for storing the debug information
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;

if (m_config.enable_debug_output) {
debug_output =
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
m_mr.main);

device::details::ccl_debug_output empty_output =
device::details::ccl_debug_output::init();

TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(debug_output.get(), &empty_output,
sizeof(device::details::ccl_debug_output),
cudaMemcpyHostToDevice, stream));
}

kernels::ccl_kernel<<<num_blocks, m_config.threads_per_partition,
2 * m_config.max_partition_size() *
sizeof(device::details::index_t),
stream>>>(
m_config, cells, modules, measurements, cell_links, m_f_backup,
m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
stream>>>(m_config, cells, modules, measurements,
cell_links, m_f_backup, m_gf_backup,
m_adjc_backup, m_adjv_backup,
m_backup_mutex.get(), debug_output.get());
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

if (debug_output) {
device::details::ccl_debug_output host_output;

TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&host_output, debug_output.get(),
sizeof(device::details::ccl_debug_output),
cudaMemcpyDeviceToHost, stream));

TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));

if (host_output.num_oversized_partitions > 0) {
std::cout << "WARNING: @clusterization_algorithm: "
<< "Clustering encountered "
<< host_output.num_oversized_partitions
<< " oversized partitions; if this number is too large, "
"it may cause performance problems."
<< std::endl;
}
}

// Return the reconstructed measurements.
return measurements;
}
Expand Down
67 changes: 55 additions & 12 deletions device/sycl/src/clusterization/clusterization_algorithm.sycl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "../sanity/ordered_on.hpp"
#include "../utils/barrier.hpp"
#include "../utils/get_queue.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/sycl/clusterization/clusterization_algorithm.hpp"
#include "traccc/sycl/utils/thread_id.hpp"
Expand Down Expand Up @@ -113,9 +114,28 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
assert(m_config.max_cells_per_thread <=
device::details::CELLS_PER_THREAD_STACK_LIMIT);

// If necessary, allocate an object for storing the debug information
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
cl::sycl::event evt_copy_debug_output_h2d;

if (m_config.enable_debug_output) {
debug_output =
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
m_mr.main);

device::details::ccl_debug_output empty_output =
device::details::ccl_debug_output::init();

evt_copy_debug_output_h2d = details::get_queue(m_queue).memcpy(
debug_output.get(), &empty_output,
sizeof(device::details::ccl_debug_output));
}

// Run ccl kernel
details::get_queue(m_queue)
.submit([&](::sycl::handler& h) {
cl::sycl::event evt_run_kernel =
details::get_queue(m_queue).submit([&](::sycl::handler& h) {
h.depends_on(evt_copy_debug_output_h2d);

// Allocate shared memory for the kernel.
vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
Expand All @@ -130,8 +150,8 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
gf_backup_view = vecmem::get_data(m_gf_backup),
adjc_backup_view = vecmem::get_data(m_adjc_backup),
adjv_backup_view = vecmem::get_data(m_adjv_backup),
mutex_ptr = m_backup_mutex.get(),
cfg = m_config](::sycl::nd_item<1> item) {
mutex_ptr = m_backup_mutex.get(), cfg = m_config,
debug_output = debug_output.get()](::sycl::nd_item<1> item) {
// Construct more readable variable names.
vecmem::data::vector_view<device::details::index_t> f_view{
static_cast<vector_size_t>(cfg.max_partition_size()),
Expand All @@ -152,15 +172,38 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
const sycl::thread_id1 thread_id(item);

// Run the algorithm for this thread.
device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
partition_start, partition_end, outi,
f_view, gf_view, f_backup_view,
gf_backup_view, adjc_backup_view,
adjv_backup_view, backup_mutex, barry_r,
measurements_view, cell_links_view);
device::ccl_kernel(
cfg, thread_id, cells_view, modules_view,
partition_start, partition_end, outi, f_view, gf_view,
f_backup_view, gf_backup_view, adjc_backup_view,
adjv_backup_view, backup_mutex, barry_r,
measurements_view, cell_links_view, debug_output);
});
})
.wait_and_throw();
});

cl::sycl::event evt_copy_debug_output_d2h;

if (debug_output) {
device::details::ccl_debug_output host_output;

evt_copy_debug_output_d2h = details::get_queue(m_queue).memcpy(
&host_output, debug_output.get(),
sizeof(device::details::ccl_debug_output), {evt_run_kernel});

evt_copy_debug_output_d2h.wait_and_throw();

if (host_output.num_oversized_partitions > 0) {
std::cout << "WARNING: @clusterization_algorithm: "
<< "Clustering encountered "
<< host_output.num_oversized_partitions
<< " oversized partitions; if this number is too large, "
"it may cause performance problems."
<< std::endl;
}
}

cl::sycl::event::wait_and_throw(
{evt_run_kernel, evt_copy_debug_output_d2h});

// Return the reconstructed measurements.
return measurements;
Expand Down
1 change: 1 addition & 0 deletions examples/options/include/traccc/options/clusterization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class clusterization
unsigned int max_cells_per_thread;
unsigned int target_cells_per_thread;
unsigned int backup_size_multiplier;
bool enable_debug_output;
/// @}

/// Print the specific options of this class
Expand Down
16 changes: 15 additions & 1 deletion examples/options/src/clusterization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@
// System include(s).
#include <iostream>

namespace {
#ifndef NDEBUG
constexpr bool enable_cca_debug_default = true;
#else
constexpr bool enable_cca_debug_default = false;
#endif
} // namespace

namespace traccc::opts {

clusterization::clusterization() : interface("Clusterization Options") {
Expand All @@ -33,6 +41,10 @@ clusterization::clusterization() : interface("Clusterization Options") {
boost::program_options::value(&backup_size_multiplier)
->default_value(256),
"The size multiplier of the backup scratch space");
m_desc.add_options()("cca-debug",
boost::program_options::value(&enable_debug_output)
->default_value(enable_cca_debug_default),
"The size multiplier of the backup scratch space");
}

clusterization::operator clustering_config() const {
Expand All @@ -42,6 +54,7 @@ clusterization::operator clustering_config() const {
rv.max_cells_per_thread = max_cells_per_thread;
rv.target_cells_per_thread = target_cells_per_thread;
rv.backup_size_multiplier = backup_size_multiplier;
rv.enable_debug_output = enable_debug_output;

return rv;
}
Expand All @@ -54,7 +67,8 @@ std::ostream& clusterization::print_impl(std::ostream& out) const {
out << " Threads per partition: " << threads_per_partition << "\n";
out << " Target cells per thread: " << target_cells_per_thread << "\n";
out << " Max cells per thread: " << max_cells_per_thread << "\n";
out << " Scratch space size mult.: " << backup_size_multiplier;
out << " Scratch space size mult.: " << backup_size_multiplier << "\n";
out << " Debug output printing: " << enable_debug_output << "\n";
return out;
}

Expand Down
2 changes: 2 additions & 0 deletions tests/common/tests/cca_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ inline traccc::clustering_config default_ccl_test_config() {
rv.max_cells_per_thread = 16;
rv.target_cells_per_thread = 8;
rv.backup_size_multiplier = 256;
rv.enable_debug_output = false;

return rv;
}
Expand All @@ -54,6 +55,7 @@ inline traccc::clustering_config tiny_ccl_test_config() {
rv.max_cells_per_thread = 1;
rv.target_cells_per_thread = 1;
rv.backup_size_multiplier = 16384;
rv.enable_debug_output = false;

return rv;
}
Expand Down
Loading