Regards #3: Additional doxygen comments, corrections and completions of existing comments, doxygen markup tweaks, and spacing tweaks
eyalroz authored and Eyal Rozenberg committed Apr 14, 2024
1 parent 7f8915f commit bd5cc96
Showing 29 changed files with 1,411 additions and 387 deletions.
6 changes: 5 additions & 1 deletion src/cuda/api/array.hpp
@@ -270,16 +270,20 @@ array_t<T, NumDimensions> wrap(
return { device_id, context_handle, handle, dimensions };
}

/// Create a new (typed) CUDA array of the specified dimensions
///@{
/// @param context ... in which the array is to be created
template <typename T, dimensionality_t NumDimensions>
array_t<T,NumDimensions> create(
const context_t& context,
dimensions_t<NumDimensions> dimensions);

/// @param device ... in whose primary context the array is to be created
template <typename T, dimensionality_t NumDimensions>
array_t<T,NumDimensions> create(
const device_t& device,
dimensions_t<NumDimensions> dimensions);

///@}

} // namespace array
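
By way of illustration, a minimal usage sketch of these two overloads (a sketch only: it assumes dimensions_t<N> is list-initializable from its extents, and the variable names are hypothetical):

auto device = cuda::device::get(0);
// Array created in the device's primary context
auto arr_2d = cuda::array::create<float, 2>(device, cuda::array::dimensions_t<2>{ 640, 480 });
// Array created in an explicitly-specified context
auto context = device.primary_context();
auto arr_3d = cuda::array::create<int, 3>(context, cuda::array::dimensions_t<3>{ 16, 16, 16 });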

4 changes: 3 additions & 1 deletion src/cuda/api/common_ptx_compilation_options.hpp
@@ -32,6 +32,7 @@ enum class memory_operation_t { load, store };
/// A helper struct for templatizing caching<Op>::mode
template <memory_operation_t Op> struct caching;

/// Load operation caching settings
template <> struct caching<memory_operation_t::load> {

/// The combination of effects the execution of an instruction will have
@@ -91,6 +92,7 @@ template <> struct caching<memory_operation_t::load> {
static constexpr const char* mode_names[] = { "ca", "cg", "cs", "lu", "cv" };
};

/// Store operation caching settings
template <> struct caching<memory_operation_t::store> {

/// The combination of effects the execution of an instruction will have
@@ -193,7 +195,7 @@ struct common_ptx_compilation_options_t {
bool generate_relocatable_device_code { false };

// What about store caching?
};
}; // common_ptx_compilation_options_t

} // namespace rtc
} // namespace cuda
22 changes: 20 additions & 2 deletions src/cuda/api/context.hpp
@@ -210,6 +210,15 @@ inline context::flags_t get_flags(handle_t handle)

} // namespace context

/**
* Waits for all previously-scheduled tasks on all streams (= queues)
* in a CUDA context to conclude, before returning.
*
* Depending on the `host_thread_sync_scheduling_policy_t` set for the
* specified context, the thread calling this method will either yield,
* spin or block until all tasks previously scheduled on streams
* within this context have concluded.
*/
inline void synchronize(const context_t& context);
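
A minimal sketch of how this might be used (assuming cuda::context::create() is the context-creation entry point):

auto device = cuda::device::get(0);
auto context = cuda::context::create(device);
// ... enqueue kernel launches and memory copies on streams within this context ...
cuda::synchronize(context); // yields, spins or blocks, per the context's sync scheduling policy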

/**
@@ -745,10 +754,18 @@ inline handle_t create_and_push(
/**
* @brief creates a new context on a given device
*
* @param device The device on which to create the new stream
* @param device
* The device which the new context will regard
* @param sync_scheduling_policy
* Choice of how host threads are to perform synchronization with pending
* actions in streams within this context. See
* @ref host_thread_sync_scheduling_policy_t for a description of these
* choices.
* @param keep_larger_local_mem_after_resize
* @return
* If true, larger allocations of global device memory, used by kernels
* requiring a larger amount of local memory, will be kept (so that future
* kernels with such requirements will not trigger a re-allocation).
*
* @note Until CUDA 11, there used to also be a flag for enabling/disabling
* the ability of mapping pinned host memory to device addresses. However, it was
* being ignored since CUDA 3.2 already, with the minimum CUDA version supported
@@ -861,6 +878,7 @@ inline context_t get_with_fallback_push()

} // namespace current

/// @return true if the context is the primary context of its device
bool is_primary(const context_t& context);
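
For instance (a sketch, assuming this free function is declared in namespace cuda::context alongside the declarations above):

auto device = cuda::device::get(0);
assert(cuda::context::is_primary(device.primary_context()));
auto fresh_context = cuda::context::create(device);
assert(not cuda::context::is_primary(fresh_context));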

namespace detail_ {
1 change: 0 additions & 1 deletion src/cuda/api/detail/unique_span.hpp
@@ -37,7 +37,6 @@ namespace cuda {
*
* @tparam T the type of individual elements in the unique_span
*/

template<typename T, typename Deleter = ::std::default_delete<T[]>>
class unique_span : public ::cuda::span<T> {
public: // span types
13 changes: 11 additions & 2 deletions src/cuda/api/device.hpp
@@ -38,7 +38,7 @@ class pool_t;
* @brief Waits for all previously-scheduled tasks on all streams (= queues)
* on a specified device to conclude.
*
* Depending on the host_thread_sync_scheduling_policy_t set for this
* Depending on the host_thread_sync_scheduling_policy_t set for the specified
* device, the thread calling this method will either yield, spin or block
* until all tasks previously scheduled on this device have been
* concluded.
@@ -49,7 +49,7 @@ namespace device {

///@cond
class primary_context_t;
///@cendond
///@endcond

using limit_t = context::limit_t;
using limit_value_t = context::limit_value_t;
@@ -604,18 +604,27 @@ class device_t {
set_flags(other_flags | static_cast<flags_type>(new_policy));
}

/// @returns true if the device will keep larger amounts of global device memory allocated
/// for use as local memory, after a kernel was executed which required a larger-than-usual
/// allocation
bool keeping_larger_local_mem_after_resize() const
{
return flags() & CU_CTX_LMEM_RESIZE_TO_MAX;
}

/// @brief Instructs the (primary context of the) device to keep larger amounts of global
/// device memory allocated for use as local memory, after a kernel was executed which
/// required a larger-than-usual allocation
void keep_larger_local_mem_after_resize(bool keep = true)
{
auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX;
flags_type new_flags = other_flags | (keep ? CU_CTX_LMEM_RESIZE_TO_MAX : 0);
set_flags(new_flags);
}
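
A short sketch of toggling this setting via the accessors defined here (and dont_keep_larger_local_mem_after_resize(), just below):

auto device = cuda::device::get(0);
device.keep_larger_local_mem_after_resize();
assert(device.keeping_larger_local_mem_after_resize());
device.dont_keep_larger_local_mem_after_resize(); // back to discarding such allocations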

/// @brief Instructs the (primary context of the) device to discard allocations of larger
/// amounts of global device memory, which were used by a kernel requiring a larger amount
/// of local memory that has since concluded execution.
void dont_keep_larger_local_mem_after_resize()
{
keep_larger_local_mem_after_resize(false);
105 changes: 103 additions & 2 deletions src/cuda/api/launch_config_builder.hpp
@@ -179,6 +179,8 @@ class launch_config_builder_t {
}
}

/// Use the information specified for the builder to figure out the grid and block
/// dimensions with which the kernel is to be launched
grid::composite_dimensions_t get_composite_dimensions() const noexcept(false)
{
auto result = get_unvalidated_composite_dimensions();
@@ -189,6 +191,10 @@
}

public:
/// Use the information specified to the builder (and defaults for the unspecified
/// information) to finalize the construction of a kernel launch configuration,
/// which can then be passed along with the kernel to a kernel-launching function,
/// e.g. the standalone @ref kernel::launch or the stream command @ref stream_t::enqueue_t::kernel_launch
launch_configuration_t build() const
{
auto result = launch_configuration_t{ get_composite_dimensions() };
@@ -392,6 +398,7 @@ class launch_config_builder_t {

}

/// Set the dimensions for each block in the intended kernel launch grid
launch_config_builder_t& block_dimensions(
grid::block_dimension_t x,
grid::block_dimension_t y = 1,
@@ -400,8 +407,17 @@
return block_dimensions(grid::block_dimensions_t{x, y, z});
}

/// Set the blocks in the intended kernel launch grid to be uni-dimensional
/// with a specified size
launch_config_builder_t& block_size(grid::block_dimension_t size) { return block_dimensions(size, 1, 1); }

/**
* Set the intended kernel launch grid to have 1D blocks, of the maximum
* length possible given the information specified to the builder.
*
* @note This will fail if neither a kernel nor a device has been chosen
* for the launch.
*/
launch_config_builder_t& use_maximum_linear_block()
{
grid::block_dimension_t max_size;
@@ -424,6 +440,16 @@
}

#if CUDA_VERSION >= 12000
/**
* Set the dimensions of multi-block clusters within the grid.
*
* @note There is only a small number of possible dimension combinations of clusters;
* and this function is _not_ guaranteed to fail immediately if you specify an
* invalid combination.
*
* @note This setting does not affect the overall dimensions of the grid in terms of
* blocks.
*/
launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims)
{
#ifndef NDEBUG
@@ -434,6 +460,9 @@
}
#endif
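
A sketch of requesting block clusters (CUDA 12 and later only; the wrapped kernel is hypothetical, and per the note above, an invalid cluster shape may only be rejected at launch time):

auto config = cuda::launch_config_builder()
	.kernel(&wrapped_kernel)
	.block_size(256)
	.grid_size(1024)
	.cluster_blocks(cuda::grid::block_dimensions_t{ 2, 1, 1 })
	.build();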

/// Set the dimensions of the grid for the intended kernel launch, in terms
/// of blocks
///@{
launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
{
#ifndef NDEBUG
@@ -447,6 +476,7 @@
return *this;
}

///@}
launch_config_builder_t& grid_dimensions(
grid::dimension_t x,
grid::dimension_t y = 1,
@@ -455,9 +485,17 @@
return grid_dimensions(grid::dimensions_t{x, y, z});
}

/// Set the grid for the intended launch to be one-dimensional, with a specified number
/// of blocks
///@{
launch_config_builder_t& grid_size(grid::dimension_t size) {return grid_dimensions(size, 1, 1); }
launch_config_builder_t& num_blocks(grid::dimension_t size) {return grid_size(size); }
///@}


/// Set the overall number of _threads_, in each dimension, of all blocks
/// in the grid of the intended kernel launch
///@{
launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims)
{
#ifndef NDEBUG
@@ -474,16 +512,30 @@
{
return overall_dimensions(grid::overall_dimensions_t{x, y, z});
}
///@}

/// Set the intended launch grid to be linear, with a specified overall number of _threads_
/// over all (1D) blocks in the grid
launch_config_builder_t& overall_size(grid::overall_dimension_t size) { return overall_dimensions(size, 1, 1); }

/**
* Set whether or not blocks may synchronize with each other
*
* @note recall that even "non-cooperative" blocks can still access the same global memory
* locations, and can use atomic operations on such locations for (slow) synchronization.
*/
launch_config_builder_t& block_cooperation(bool cooperation)
{
thread_block_cooperation = cooperation;
return *this;
}

/// Let kernel thread blocks synchronize with each other, rather than being guaranteed
/// to act independently (atomic global memory operations notwithstanding)
launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); }

/// Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will
/// work entirely independently (atomic global memory operations notwithstanding)
launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); }

launch_config_builder_t& dynamic_shared_memory_size(
@@ -493,11 +545,18 @@
return *this;
}

/// Indicate that the intended launch should not allocate any shared
/// memory for the kernel to use beyond the static amount necessitated
/// by its (compiled) code.
launch_config_builder_t& no_dynamic_shared_memory()
{
return dynamic_shared_memory_size(memory::shared::size_t(0));
}

/// Indicate that the intended launch should allocate a certain amount of shared
/// memory for the kernel to use beyond the static amount necessitated
/// by its (compiled) code.
///@{
launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size)
{
#ifndef NDEBUG
@@ -512,13 +571,32 @@
{
return dynamic_shared_memory_size(size);
}
///@}

/**
* Indicate that the intended launch should allocate additional shared
* memory for the kernel to use beyond the static amount necessitated
* by its (compiled) code - with the amount to be determined based on
* the block size
*
* @param shared_mem_size_determiner a function determining the dynamic
* shared memory size given the kernel launch block size
*/
launch_config_builder_t& dynamic_shared_memory(
kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
{
return dynamic_shared_memory_size(shared_mem_size_determiner);
}
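
A sketch of supplying a size-determiner (assuming kernel::shared_memory_size_determiner_t is a plain function pointer taking the block size; the wrapped kernel is hypothetical):

// One int's worth of dynamic shared memory per thread in a block
size_t one_int_per_thread(int block_size)
{
	return sizeof(int) * static_cast<size_t>(block_size);
}

auto config = cuda::launch_config_builder()
	.kernel(&wrapped_kernel)
	.block_size(128)
	.num_blocks(64)
	.dynamic_shared_memory(one_int_per_thread)
	.build();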

/**
* Indicate that the specified wrapped kernel will be the one launched
* with the configuration to be produced by this object. Such an indication
* provides this object with information about the device and context in
* which the kernel is to be launched, and ranges of possible values for
* certain parameters (e.g. shared memory size, dimensions).
*
* @note Calling this method obviates a call to the @ref device() method.
*/
launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
{
if (device_ and kernel_->device_id() != device_.value()) {
@@ -533,6 +611,15 @@
return *this;
}

/**
* Indicate that the intended kernel launch would occur on (some stream in
* some context on) the specified device. Such an indication provides this
* object with some information regarding ranges of possible values for
* certain parameters (e.g. shared memory size, dimensions).
*
* @note Do not call both this and the @ref kernel() method; prefer just that one.
*/
///@{
launch_config_builder_t& device(const device::id_t device_id)
{
if (kernel_ and kernel_->device_id() != device_id) {
@@ -548,7 +635,11 @@
{
return this->device(device.id());
}
///@}

/// Clear the association with a specific kernel (which may have been
/// set using the @ref kernel method)
///@{
launch_config_builder_t& kernel_independent()
{
kernel_ = nullptr;
Expand All @@ -559,13 +650,14 @@ class launch_config_builder_t {
kernel_ = nullptr;
return *this;
}
///@}

/**
* @brief THis will use information about the kernel, the already-set block size,
* @brief This will use information about the kernel, the already-set block size,
* and the device to create a unidimensional grid of blocks to exactly saturate
* the CUDA device's capacity for simultaneous active blocks.
*
* @note This will _not_ set the block size - unlike
* @note This will _not_ set the block size - unlike {@ref min_params_for_max_occupancy()}.
*/
launch_config_builder_t& saturate_with_active_blocks()
{
@@ -584,6 +676,14 @@
return *this;
}

/**
* @brief This will use information about the kernel and the device to define
* a minimum launch grid which should guarantee maximum occupancy of the GPU's
* multiprocessors.
*
* @note A builder after this call _will_ set the block dimensions - unlike
* {@ref saturate_with_active_blocks()} .
*/
launch_config_builder_t& min_params_for_max_occupancy()
{
if (not (kernel_)) {
@@ -600,6 +700,7 @@
}
}; // launch_config_builder_t

/// A slightly shorter-named construction idiom for @ref launch_config_builder_t
inline launch_config_builder_t launch_config_builder() { return {}; }

} // namespace cuda
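
Tying the pieces together, a typical use of the builder might look as follows (a sketch; the kernel, stream and arguments are hypothetical):

auto launch_config = cuda::launch_config_builder()
	.kernel(&wrapped_kernel)
	.overall_size(1u << 20)
	.block_size(256)
	.no_dynamic_shared_memory()
	.blocks_dont_cooperate()
	.build();
stream.enqueue.kernel_launch(wrapped_kernel, launch_config, arg1, arg2);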