From 410c7d06e9c7b2612aa9b5ef1257bbec73c01705 Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg
Date: Mon, 5 Feb 2024 20:53:55 +0200
Subject: [PATCH] Fixes #582, fixes #564:

* launch config <-> device validation now checks for block cooperation
  support when that's requested
* Refactored and re-located some of the launch config validation code
* Added: `device_t` method for checking block clustering support
* Now properly validating grid dimensions to ensure we don't exceed the maxima
* Made sure the code paths inwards from the non-detail_ launching functions
  to the actual CUDA API calls all have appropriate validation calls
* Comment and spacing tweaks

---
 src/cuda/api/device.hpp                            |  11 ++
 src/cuda/api/kernel_launch.hpp                     |  24 +--
 src/cuda/api/launch_config_builder.hpp             |  46 ++++-
 src/cuda/api/launch_configuration.hpp              |  12 +-
 .../api/multi_wrapper_impls/kernel_launch.hpp      | 182 ++++++++++++++----
 .../launch_configuration.hpp                       |  30 ++-
 src/cuda/api/types.hpp                             |   8 +
 7 files changed, 228 insertions(+), 85 deletions(-)

diff --git a/src/cuda/api/device.hpp b/src/cuda/api/device.hpp
index e7469d66..36967143 100644
--- a/src/cuda/api/device.hpp
+++ b/src/cuda/api/device.hpp
@@ -357,6 +357,17 @@ class device_t {
 		return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH);
 	}
 
+#if CUDA_VERSION >= 12000
+	/**
+	 * True if this device supports "clusters" of grid blocks,
+	 * which can pool their shared memory together
+	 */
+	bool supports_block_clustering() const
+	{
+		return get_attribute(CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH);
+	}
+#endif
+
 #if CUDA_VERSION >= 11020
 	/**
 	 * True if this device supports executing kernels in which blocks can
diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp
index 51657d19..6fe2dfe8 100644
--- a/src/cuda/api/kernel_launch.hpp
+++ b/src/cuda/api/kernel_launch.hpp
@@ -178,9 +178,6 @@ void enqueue_raw_kernel_launch_in_current_context(
 	using decayed_kf_type = typename ::std::decay<KernelFunction>::type;
 	static_assert(::std::is_function<decayed_kf_type>::value or is_function_ptr<decayed_kf_type>::value,
 		"Only a bona fide function can be launched as a CUDA kernel");
-#ifndef NDEBUG
-	validate(launch_configuration);
-#endif
 	if (not launch_configuration.has_nondefault_attributes()) {
 		// regular plain vanilla launch
 		kernel_function <<<
@@ -323,26 +320,7 @@ void enqueue_launch(
 	Kernel&&                kernel,
 	const stream_t&         stream,
 	launch_configuration_t  launch_configuration,
-	KernelParameters&&...   parameters)
-{
-	static_assert(
-		detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
-		"All kernel parameter types must be of a trivially copy-constructible (decayed) type." );
-	static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
-#if CUDA_VERSION >= 12000
-	static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
-#else
-	static constexpr const bool library_kernel = false;
-#endif // CUDA_VERSION >= 12000
-	// We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
-	// use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch
-	// function for each of them will basically be just a one-liner :-(
-	detail_::enqueue_launch(
-		detail_::bool_constant<wrapped_contextual_kernel>{},
-		detail_::bool_constant<library_kernel>{},
-		::std::forward<Kernel>(kernel), stream, launch_configuration,
-		::std::forward<KernelParameters>(parameters)...);
-}
+	KernelParameters&&...   parameters);
 
 /**
  * Variant of @ref enqueue_launch for use with the default stream in the current context.
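
[Editorial note, not part of the patch: a minimal usage sketch of the call path this change adds validation to. The builder method names `block_size`, `grid_size` and `block_cooperation`, the kernel, and all sizes are assumptions for illustration only; `launch_config_builder()`, `enqueue_launch()` and the NDEBUG-guarded `detail_::validate()` call are taken from this patch and the library's headers.]

	#include <cuda/api.hpp>  // the library's umbrella header

	__global__ void my_coop_kernel(int* data);  // hypothetical kernel

	void run(const cuda::stream_t& stream, int* data)
	{
		// Hypothetical sketch: request inter-block cooperation, then launch.
		// With this patch, debug builds validate the configuration inside
		// cuda::enqueue_launch(), and the device-compatibility check further
		// down the call path consults device_t::supports_block_cooperation()
		// instead of letting the driver call fail later.
		auto config = cuda::launch_config_builder()
			.block_size(256)        // threads per block
			.grid_size(280)         // blocks in the grid; made-up figure
			.block_cooperation()    // the launch will require cooperative support
			.build();
		cuda::enqueue_launch(my_coop_kernel, stream, config, data);
	}
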
diff --git a/src/cuda/api/launch_config_builder.hpp b/src/cuda/api/launch_config_builder.hpp
index 471541ee..a1b960ad 100644
--- a/src/cuda/api/launch_config_builder.hpp
+++ b/src/cuda/api/launch_config_builder.hpp
@@ -1,9 +1,9 @@
 /**
  * @file
  *
- * @brief Contains the @ref launch
+ * @brief Contains the @ref `cuda::launch_config_builder_t` class definition
  *
- * @note Launch configurations are used mostly in @ref kernel_launch.hpp .
+ * @note Launch configurations are used mostly in @ref `kernel_launch.hpp`.
  */
 
 #pragma once
@@ -178,6 +178,7 @@ class launch_config_builder_t {
 
 	struct {
 		optional<grid::block_dimensions_t> block;
+		optional<grid::dimensions_t> block_cluster;
 		optional<grid::dimensions_t> grid;
 		optional<grid::overall_dimensions_t> overall;
 	} dimensions_;
@@ -188,7 +189,7 @@ class launch_config_builder_t {
 	// but the semantic is that if the determiner is not null, we use it;
 	// and if you want to force a concrete apriori value, then you nullify
 	// the determiner
-	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ {nullptr };
+	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ { nullptr };
 	memory::shared::size_t dynamic_shared_memory_size_ { 0 };
 
 	const kernel_t* kernel_ { nullptr };
@@ -224,7 +225,7 @@ class launch_config_builder_t {
 		memory::shared::size_t shared_mem_size)
 	{
 		if (kernel_ptr == nullptr) { return; }
-		detail_::validate_compatibility(*kernel_ptr, shared_mem_size);
+		detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
 	}
 
 	static void validate_compatibility(
@@ -232,7 +233,7 @@ class launch_config_builder_t {
 		memory::shared::size_t shared_mem_size)
 	{
 		if (not maybe_device_id) { return; }
-		detail_::validate_compatibility(device(maybe_device_id), shared_mem_size);
+		detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
 	}
 
 	void validate_dynamic_shared_memory_size(memory::shared::size_t size)
@@ -269,6 +270,15 @@ class launch_config_builder_t {
 		validate_block_dimension_compatibility(device_, block_dims);
 	}
 
+	static void validate_grid_dimension_compatibility(
+		optional<device::id_t> maybe_device_id,
+		grid::dimensions_t grid_dims)
+	{
+		if (not maybe_device_id) { return; }
+		detail_::validate_grid_dimension_compatibility(device(maybe_device_id), grid_dims);
+	}
+
 	void validate_grid_dimensions(grid::dimensions_t grid_dims) const
 	{
 		detail_::validate_grid_dimensions(grid_dims);
@@ -279,6 +289,16 @@ class launch_config_builder_t {
 		// TODO: Check divisibility
 	}
 
+#if CUDA_VERSION >= 12000
+	void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
+	{
+		if (dimensions_.grid and not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
+			throw ::std::runtime_error("The requested block cluster dimensions do not "
+				"divide the grid dimensions (in blocks)");
+		}
+	}
+#endif // CUDA_VERSION >= 12000
+
 	void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
 	{
 		if (dimensions_.block and dimensions_.grid) {
@@ -309,7 +329,8 @@ class launch_config_builder_t {
 				get_composite_dimensions().block;
 			validate_block_dimension_compatibility(device_id, block_dims);
 		}
-		validate_compatibility(device_id, dynamic_shared_memory_size_);
+		detail_::validate_compatibility(
+			device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
 	}
 
 	void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
@@ -318,7 +339,7 @@ class launch_config_builder_t {
 	{
 		validate_block_dimension_compatibility(device_, composite_dims.block);
 
 		// Is there anything to validate regarding the grid dims?
-		validate_block_dimension_compatibility(device_, composite_dims.grid);
+		validate_grid_dimension_compatibility(device_, composite_dims.grid);
 	}
 #endif // ifndef NDEBUG
 
@@ -378,6 +399,17 @@ class launch_config_builder_t {
 		return *this;
 	}
 
+#if CUDA_VERSION >= 12000
+	launch_config_builder_t& cluster_blocks(grid::dimensions_t cluster_dims)
+	{
+#ifndef NDEBUG
+		validate_cluster_dimensions(cluster_dims);
+#endif
+		dimensions_.block_cluster = cluster_dims;
+		return *this;
+	}
+#endif
+
 	launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
 	{
 #ifndef NDEBUG
diff --git a/src/cuda/api/launch_configuration.hpp b/src/cuda/api/launch_configuration.hpp
index 17552bc8..455d4704 100644
--- a/src/cuda/api/launch_configuration.hpp
+++ b/src/cuda/api/launch_configuration.hpp
@@ -207,7 +207,7 @@ namespace detail_ {
 
 // Note: This will not check anything related to the device or the kernel
 // with which the launch configuration is to be used
-inline void validate(launch_configuration_t launch_config) noexcept(false)
+inline void validate(const launch_configuration_t& launch_config) noexcept(false)
 {
 	validate_block_dimensions(launch_config.dimensions.block);
 	validate_grid_dimensions(launch_config.dimensions.grid);
@@ -223,15 +223,9 @@ inline void validate_compatibility(
 	// validate_grid_dimension_compatibility(device, launch_config.dimensions.grid);
 }
 
-inline void validate_compatibility(
+void validate_compatibility(
 	const kernel_t& kernel,
-	launch_configuration_t launch_config) noexcept(false)
-{
-	validate(launch_config);
-	validate_block_dimension_compatibility(kernel, launch_config.dimensions.block);
-	// Uncomment if we actually get such checks
-	// validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
-}
+	launch_configuration_t launch_config) noexcept(false);
 
 using launch_attribute_index_t = unsigned int;
 
diff --git a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
index cb00eb8f..0d34805f 100644
--- a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
+++ b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp
@@ -21,9 +21,41 @@
 
 namespace cuda {
 
+template <typename Kernel, typename... KernelParameters>
+void enqueue_launch(
+	Kernel&&                kernel,
+	const stream_t&         stream,
+	launch_configuration_t  launch_configuration,
+	KernelParameters&&...   parameters)
+{
+	static_assert(
+		detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
+		"All kernel parameter types must be of a trivially copy-constructible (decayed) type."
+	);
+	static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
+#if CUDA_VERSION >= 12000
+	static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
+#else
+	static constexpr const bool library_kernel = false;
+#endif // CUDA_VERSION >= 12000
+#ifndef NDEBUG
+	// wrapped kernel and library kernel compatibility with the launch configuration
+	// will be validated further inside, when we differentiate them from raw kernels
+	detail_::validate(launch_configuration);
+#endif
+
+	// We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
+	// use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch
+	// function for each of them will basically be just a one-liner :-(
+	detail_::enqueue_launch(
+		detail_::bool_constant<wrapped_contextual_kernel>{},
+		detail_::bool_constant<library_kernel>{},
+		::std::forward<Kernel>(kernel), stream, launch_configuration,
+		::std::forward<KernelParameters>(parameters)...);
+}
+
 namespace detail_ {
 
-inline void validate_compatibility(
+inline void validate_shared_mem_compatibility(
 	const device_t &device,
 	memory::shared::size_t shared_mem_size)
 {
@@ -40,6 +72,61 @@ inline void validate_compatibility(
 	}
 }
 
+inline void validate_compatibility(
+	const device::id_t device_id,
+	memory::shared::size_t shared_mem_size,
+	bool cooperative_launch,
+	optional<grid::dimensions_t> block_cluster_dimensions)
+{
+	auto device = device::get(device_id);
+	if (cooperative_launch and not device.supports_block_cooperation()) {
+		throw ::std::runtime_error(device::detail_::identify(device_id)
+			+ " cannot launch kernels with inter-block cooperation");
+	}
+	validate_shared_mem_compatibility(device, shared_mem_size);
+	if (block_cluster_dimensions) {
+#if CUDA_VERSION >= 12000
+		if (not device.supports_block_clustering()) {
+			throw ::std::runtime_error(device::detail_::identify(device_id)
+				+ " cannot launch kernels with block clustering");
+		}
+		// TODO: Uncomment this once the CUDA driver offers info on the maximum
+		// cluster size...
+		//
+		// auto max_cluster_size = ???;
+		// auto cluster_size = block_cluster_dimensions.value().volume();
+		// if (cluster_size > max_cluster_size) {
+		// 	throw ::std::runtime_error(device::detail_::identify(device_id)
+		// 		+ " only supports as many as " + ::std::to_string(max_cluster_size)
+		// 		+ " blocks per block-cluster, but " + ::std::to_string(cluster_size)
+		// 		+ " were requested");
+		// }
+#else
+		throw ::std::runtime_error("Block clusters are not supported with CUDA versions earlier than 12.0");
+#endif // CUDA_VERSION >= 12000
+	}
+
+	// The CUDA driver does not offer us information with which we could check the validity
+	// of trying a programmatically dependent launch, or a programmatic completion event,
+	// or the use of a "remote" memory synchronization domain.
+	// So, we assume these are all valid.
+}
+
+template <typename Dims>
+inline void validate_any_dimensions_compatibility(const device_t &device, Dims dims, Dims maxima, const char* kind)
+{
+	auto device_id = device.id();
+	auto check =
+		[device_id, kind](grid::dimension_t dim, grid::dimension_t max, const char *axis) {
+			if (max < dim) {
+				throw ::std::invalid_argument(
+					::std::string("specified ") + kind + " " + axis + "-axis dimension " + ::std::to_string(dim)
+					+ " exceeds the maximum supported " + axis + " dimension of " + ::std::to_string(max)
+					+ " for " + device::detail_::identify(device_id));
+			}
+		};
+	check(dims.x, maxima.x, "X");
+	check(dims.y, maxima.y, "Y");
+	check(dims.z, maxima.z, "Z");
+}
+
 inline void validate_block_dimension_compatibility(
 	const device_t &device,
 	grid::block_dimensions_t block_dims)
@@ -52,27 +139,28 @@ inline void validate_block_dimension_compatibility(
 			+ ", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
 			+ " for " + device::detail_::identify(device.id()));
 	}
-	auto dim_maxima = grid::block_dimensions_t{
+	auto maxima = grid::block_dimensions_t{
 		static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)),
 		static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)),
 		static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
 	};
-	auto device_id = device.id();
-	auto check =
-		[device_id](grid::block_dimension_t dim, grid::block_dimension_t max, const char *axis) {
-			if (max < dim) {
-				throw ::std::invalid_argument(
-					::std::string("specified block ") + axis + "-axis dimension " + ::std::to_string(dim)
-					+ " exceeds the maximum supported " + axis + " dimension of " + ::std::to_string(max)
-					+ " for " + device::detail_::identify(device_id));
-			}
-		};
-	check(block_dims.x, dim_maxima.x, "X");
-	check(block_dims.y, dim_maxima.y, "Y");
-	check(block_dims.z, dim_maxima.z, "Z");
+	validate_any_dimensions_compatibility(device, block_dims, maxima, "block");
 }
 
-inline void validate_compatibility(
+inline void validate_grid_dimension_compatibility(
+	const device_t &device,
+	grid::dimensions_t grid_dims)
+{
+	auto maxima = grid::dimensions_t{
+		static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)),
+		static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)),
+		static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
+	};
+	validate_any_dimensions_compatibility(device, grid_dims, maxima, "grid");
+}
+
+inline void validate_shared_mem_size_compatibility(
 	const kernel_t& kernel_ptr,
 	memory::shared::size_t shared_mem_size)
 {
@@ -118,11 +206,7 @@ void enqueue_launch_helper::ope
 	// `KernelParameter` pack may contain some references, arrays and so on - which CUDA
 	// kernels cannot accept; so we massage those a bit.
 
-#ifndef NDEBUG
-	validate_compatibility(stream.device(), launch_configuration);
-	validate_compatibility(wrapped_kernel, launch_configuration);
-	// validating the configuration unto-itself should happen within the following function:
-#endif
+	// It is assumed the arguments have already been validated
 
 	detail_::enqueue_raw_kernel_launch_in_current_context(
 		unwrapped_kernel_function,
@@ -140,7 +224,6 @@ marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)
 	return ::std::array<const void*, sizeof...(KernelParameters)> {
 		&parameters...
 	};
 }
 
-
 // Note: The last (valid) element of marshalled_arguments must be null
 
 inline void enqueue_kernel_launch_by_handle_in_current_context(
 	kernel::handle_t kernel_function_handle,
@@ -150,9 +233,8 @@ inline void enqueue_kernel_launch_by_handle_in_current_context(
 	launch_configuration_t launch_config,
 	const void** marshalled_arguments)
 {
-#ifndef NDEBUG
-	validate(launch_config);
-#endif
+	// It is assumed the arguments have already been validated
+
 	status_t status;
 	const auto&lc = launch_config; // alias for brevity
 #if CUDA_VERSION >= 12000
@@ -198,25 +280,30 @@ inline void enqueue_kernel_launch_by_handle_in_current_context(
 
 template <typename... KernelParameters>
 struct enqueue_launch_helper<kernel_t, KernelParameters...> {
+
 	void operator()(
 		const kernel_t& wrapped_kernel,
-		const stream_t & stream,
+		const stream_t& stream,
 		launch_configuration_t launch_config,
-		KernelParameters &&... arguments) const
+		KernelParameters&&... arguments) const
 	{
-		auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
-		auto function_handle = wrapped_kernel.handle();
-		CAW_SET_SCOPE_CONTEXT(stream.context_handle());
+		// It is assumed the arguments have already been validated
+
 #ifndef NDEBUG
-		validate_compatibility(stream.device(), launch_config);
+		if (wrapped_kernel.context() != stream.context()) {
+			throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(wrapped_kernel)
+				+ " on " + stream::detail_::identify(stream) + ": Different contexts"};
+		}
 		validate_compatibility(wrapped_kernel, launch_config);
 #endif
+		auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
+		auto function_handle = wrapped_kernel.handle();
+		CAW_SET_SCOPE_CONTEXT(stream.context_handle());
 		enqueue_kernel_launch_by_handle_in_current_context(
 			function_handle, stream.device_id(), stream.context_handle(),
 			stream.handle(), launch_config, marshalled_arguments.data());
-	}
-
+	} // operator()
 };
 
 template <typename RawKernelFunction, typename... KernelParameters>
@@ -228,14 +315,12 @@ void enqueue_launch(
 	launch_configuration_t  launch_configuration,
 	KernelParameters&&...   parameters)
 {
+	// It is assumed the arguments have already been validated
+
 	// Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to,
 	// and not have trouble enqueueing into a stream in another context - it balks at doing so under
 	// certain conditions, so we must place ourselves in the stream's context.
 	CAW_SET_SCOPE_CONTEXT(stream.context_handle());
-#ifndef NDEBUG
-	validate_compatibility(stream.device(), launch_configuration);
-	// validating the configuration without the device should happen within the next function...
-#endif
 	detail_::enqueue_raw_kernel_launch_in_current_context(
 		kernel_function, stream.device_id(), stream.context_handle(), stream.handle(), launch_configuration,
 		::std::forward<KernelParameters>(parameters)...);
@@ -250,6 +335,15 @@ void enqueue_launch(
 	launch_configuration_t  launch_configuration,
 	KernelParameters&&...   parameters)
 {
+	// It is assumed the arguments have already been validated - except for:
+#ifndef NDEBUG
+	if (kernel.context() != stream.context()) {
+		throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
+			+ " on " + stream::detail_::identify(stream) + ": Different contexts"};
+	}
+	detail_::validate_compatibility(kernel, launch_configuration);
+#endif // #ifndef NDEBUG
+
 	enqueue_launch_helper<typename ::std::decay<Kernel>::type, KernelParameters...>{}(
 		::std::forward<Kernel>(kernel), stream, launch_configuration,
 		::std::forward<KernelParameters>(parameters)...);
@@ -265,6 +359,10 @@ void enqueue_launch(
 	launch_configuration_t launch_configuration,
 	KernelParameters&&... parameters)
 {
+	// Launch configuration is assumed to have been validated separately
+	// from the kernel, and their compatibility will be validated further
+	// inside, against the contextualized kernel
+
 	kernel_t contextualized = cuda::contextualize(kernel, stream.context());
 	enqueue_launch_helper<kernel_t, KernelParameters...>{}(
 		contextualized, stream, launch_configuration,
@@ -280,6 +378,8 @@ inline void launch(
 	launch_configuration_t launch_configuration,
 	KernelParameters&&... parameters)
 {
+	// Argument validation will occur within the call to enqueue_launch
+
 	auto primary_context = detail_::get_implicit_primary_context(::std::forward<Kernel>(kernel));
 	auto stream = primary_context.default_stream();
 
@@ -296,11 +396,18 @@ inline void launch_type_erased(
 	launch_configuration_t launch_configuration,
 	SpanOfConstVoidPtrLike marshalled_arguments)
 {
+	// Note: We assume that kernel, stream and launch_configuration have already been validated.
 	static_assert(
 		::std::is_same<typename SpanOfConstVoidPtrLike::value_type, void*>::value or ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, const void*>::value,
 		"The element type of the marshalled arguments container type must be either void* or const void*");
 #ifndef NDEBUG
+	if (kernel.context() != stream.context()) {
+		throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
+			+ " on " + stream::detail_::identify(stream) + ": Different contexts"};
+	}
+	detail_::validate_compatibility(kernel, launch_configuration);
+	detail_::validate(launch_configuration);
 	if (*(marshalled_arguments.end() - 1) != nullptr) {
 		throw ::std::invalid_argument("marshalled arguments for a kernel launch must end with a nullptr element");
 	}
@@ -323,6 +430,7 @@ void launch_type_erased(
 	launch_configuration_t launch_configuration,
 	SpanOfConstVoidPtrLike marshalled_arguments)
 {
+	// Argument validation will occur inside the call to launch_type_erased
 	auto contextualized = contextualize(kernel, stream.context());
 	launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments);
 }
diff --git a/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp b/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp
index b4b4c3bd..670c4ead 100644
--- a/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp
+++ b/src/cuda/api/multi_wrapper_impls/launch_configuration.hpp
@@ -9,45 +9,57 @@
 #define CUDA_API_WRAPPERS_MULTI_WRAPPERS_LAUNCH_CONFIGURATION_HPP
 
 #include "../launch_configuration.hpp"
+#include "../kernel.hpp"
 
 namespace cuda {
 
 namespace detail_ {
 
+inline void validate_compatibility(
+	const kernel_t& kernel,
+	launch_configuration_t launch_config) noexcept(false)
+{
+	validate(launch_config);
+	validate_block_dimension_compatibility(kernel, launch_config.dimensions.block);
+	// Uncomment if we actually get such checks
+	// validate_grid_dimension_compatibility(kernel, launch_config.dimensions.grid);
+	validate_compatibility(kernel.device(), launch_config);
+}
+
 #if CUDA_VERSION >= 12000
 inline CUlaunchConfig marshal(
 	const launch_configuration_t &config,
 	const stream::handle_t stream_handle,
 	span<CUlaunchAttribute> attribute_storage) noexcept(true)
 {
-	unsigned int attribute_index = 0;
+	unsigned int num_attributes = 0;
 	// TODO: What about CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW ?
 	if (config.block_cooperation) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
 		attr_value.value.cooperative = 1;
 	}
 	if (grid::dimensions_t::point() != config.clustering.cluster_dimensions) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
 		attr_value.value.clusterDim.x = config.clustering.cluster_dimensions.x;
 		attr_value.value.clusterDim.y = config.clustering.cluster_dimensions.y;
 		attr_value.value.clusterDim.z = config.clustering.cluster_dimensions.z;
 	}
 	if (config.clustering.scheduling_policy != cluster_scheduling_policy_t::default_) {
-		auto &attribute = attribute_storage[attribute_index++];
+		auto &attribute = attribute_storage[num_attributes++];
 		attribute.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
 		attribute.value.clusterSchedulingPolicyPreference = static_cast<CUclusterSchedulingPolicy>(config.clustering.scheduling_policy);
 	}
 	// TODO: CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
 	if (config.programmatically_dependent_launch) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
 		attr_value.value.programmaticStreamSerializationAllowed = 1;
 	}
 	if (config.programmatic_completion.event) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT;
 		attr_value.value.programmaticEvent.event = config.programmatic_completion.event->handle();
 		// TODO: What about the flags?
@@ -56,12 +68,12 @@ inline CUlaunchConfig marshal(
 	}
 	// What about CU_LAUNCH_ATTRIBUTE_PRIORITY ?
 	if (config.in_remote_memory_synchronization_domain) {
-		auto &attr_value = attribute_storage[attribute_index++];
+		auto &attr_value = attribute_storage[num_attributes++];
 		attr_value.id = CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN;
 		attr_value.value.memSyncDomain = CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE;
 	}
 	// What about CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP ?
-	attribute_storage[attribute_index] = { CU_LAUNCH_ATTRIBUTE_IGNORE, {}};
+	attribute_storage[num_attributes] = {CU_LAUNCH_ATTRIBUTE_IGNORE, {}};
 
 	return {
 		config.dimensions.grid.x,
@@ -73,7 +85,7 @@ inline CUlaunchConfig marshal(
 		config.dynamic_shared_memory_size,
 		stream_handle,
 		attribute_storage.data(),
-		attribute_index
+		num_attributes
 	};
 }
 #endif // CUDA_VERSION >= 12000
diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp
index 5c32ad36..5156776d 100644
--- a/src/cuda/api/types.hpp
+++ b/src/cuda/api/types.hpp
@@ -428,6 +428,14 @@ struct dimensions_t // this almost-inherits dim3
 	static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x, 1 }; }
 	static constexpr __host__ __device__ dimensions_t line(dimension_t x)   { return dimensions_t{ x, 1, 1 }; }
 	static constexpr __host__ __device__ dimensions_t point()               { return dimensions_t{ 1, 1, 1 }; }
+
+	static bool divides(dimensions_t lhs, dimensions_t rhs)
+	{
+		return
+			(rhs.x % lhs.x == 0) and
+			(rhs.y % lhs.y == 0) and
+			(rhs.z % lhs.z == 0);
+	}
 };
 
 ///@cond
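
[Editorial note, not part of the patch: a self-contained sketch of the semantics of the `divides` predicate added to `dimensions_t` in types.hpp, on which the new `validate_cluster_dimensions()` check is built. The struct below is a pared-down stand-in for the library type, compilable on its own.]

	#include <cstdio>

	struct dimensions_t {
		unsigned x, y, z;
		// Mirrors the predicate added in types.hpp: lhs divides rhs
		// axis-by-axis, i.e. an lhs-shaped box tiles an rhs-shaped box exactly.
		static bool divides(dimensions_t lhs, dimensions_t rhs)
		{
			return
				(rhs.x % lhs.x == 0) and
				(rhs.y % lhs.y == 0) and
				(rhs.z % lhs.z == 0);
		}
	};

	int main()
	{
		dimensions_t cluster { 2, 2, 1 };
		dimensions_t grid    { 8, 4, 3 };
		// validate_cluster_dimensions() (launch_config_builder.hpp) throws
		// precisely when this predicate is false for (cluster, grid):
		std::printf("2x2x1 cluster tiles an 8x4x3 grid: %s\n",
			dimensions_t::divides(cluster, grid) ? "yes" : "no");  // prints "yes"
		return 0;
	}
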