From d350e1e59af4b7197d5452674ff690001b9988e8 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sun, 20 Oct 2024 21:43:49 +0300 Subject: [PATCH] Fixes #689: All remaining `async` subnamespace operations - `typed_set()`, `set()` and `zero()` - now out of the namespace, and unified with their non-async variants --- src/cuda/api/memory.hpp | 144 +++++++++----------- src/cuda/api/multi_wrapper_impls/memory.hpp | 31 +++-- src/cuda/api/stream.hpp | 4 +- 3 files changed, 85 insertions(+), 94 deletions(-) diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index ec906af9..8f60e655 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -373,9 +373,10 @@ struct deleter { * @param start The first location to set to @p value ; must be properly aligned. * @param value A (properly aligned) value to set T-elements to. * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes). + * @param stream A stream on which to schedule this action; may be omitted. 
 */ template -void typed_set(T* start, const T& value, size_t num_elements); +void typed_set(T* start, const T& value, size_t num_elements, optional_ref stream = {}); /** * Sets all bytes in a region of memory to a fixed value @@ -386,10 +387,26 @@ void typed_set(T* start, const T& value, size_t num_elements); * @param start starting address of the memory region to set, in a CUDA * device's global memory * @param num_bytes size of the memory region in bytes + * @param stream a stream on which to schedule the operation; may be omitted + * + * + * Asynchronously sets all bytes in a stretch of memory to a single value + * + * @note asynchronous version of @ref memory::set(void*, int, size_t) + * + * @param start starting address of the memory region to set, + * in a CUDA device's global memory + * @param byte_value value to set the memory region to + * @param num_bytes size of the memory region in bytes + * @param stream A stream on which to schedule this action; may be omitted. */ -inline void set(void* start, int byte_value, size_t num_bytes) +inline void set(void* start, int byte_value, size_t num_bytes, optional_ref stream = {}) { - return typed_set(static_cast(start), static_cast(byte_value), num_bytes); + return typed_set( + static_cast(start), + static_cast(byte_value), + num_bytes, + stream); } /** @@ -399,10 +416,11 @@ inline void set(void* start, int byte_value, size_t num_bytes) * * @param byte_value value to set the memory region to * @param region a region to zero-out, in a CUDA device's global memory + * @param stream A stream on which to schedule this action; may be omitted. 
*/ -inline void set(region_t region, int byte_value) +inline void set(region_t region, int byte_value, optional_ref stream = {}) { - set(region.start(), byte_value, region.size()); + set(region.start(), byte_value, region.size(), stream); } /** @@ -411,10 +429,11 @@ inline void set(region_t region, int byte_value) * @param start the beginning of a region of memory to zero-out, accessible * within a CUDA device's global memory * @param num_bytes the size in bytes of the region of memory to zero-out + * @param stream A stream on which to schedule this action; may be omitted. */ -inline void zero(void* start, size_t num_bytes) +inline void zero(void* start, size_t num_bytes, optional_ref stream = {}) { - set(start, 0, num_bytes); + set(start, 0, num_bytes, stream); } /** @@ -422,23 +441,24 @@ inline void zero(void* start, size_t num_bytes) * * @param region the memory region to zero-out, accessible as a part of a * CUDA device's global memory + * @param stream A stream on which to schedule this action; may be omitted. */ -inline void zero(region_t region) +inline void zero(region_t region, optional_ref stream = {}) { - zero(region.start(), region.size()); + zero(region.start(), region.size(), stream); } - /** * Sets all bytes of a single pointed-to value to 0 * * @param ptr pointer to a value of a certain type, accessible within * in a CUDA device's global memory + * @param stream an existing stream on which to schedule this action; may be omitted */ template -inline void zero(T* ptr) +inline void zero(T* ptr, optional_ref stream = {}) { - zero(ptr, sizeof(T)); + zero(ptr, sizeof(T), stream); } } // namespace device @@ -446,15 +466,6 @@ inline void zero(T* ptr) /// Asynchronous memory operations namespace detail_ { -/** - * Asynchronous versions of @ref memory::copy functions. 
- * - * - * @note Since we assume Compute Capability >= 2.0, all devices support the - * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, - * where the data is located, and one does not have to specify this. - */ - ///@{ /** @@ -712,8 +723,9 @@ inline void copy(T(&destination)[N], T* source, optional_ref str * memory, global CUDA-device-side memory or CUDA-managed memory. * @param byte_value value to set the memory region to * @param num_bytes The amount of memory to set to @p byte_value + * @param stream A stream on which to schedule this action; may be omitted. */ -void set(void* ptr, int byte_value, size_t num_bytes); +void set(void* ptr, int byte_value, size_t num_bytes, optional_ref stream); /** * Sets all bytes in a region of memory to a fixed value @@ -724,10 +736,11 @@ void set(void* ptr, int byte_value, size_t num_bytes); * @param region the memory region to set; may be in host-side memory, * global CUDA-device-side memory or CUDA-managed memory. * @param byte_value value to set the memory region to + * @param stream A stream on which to schedule this action; may be omitted. */ -inline void set(region_t region, int byte_value) +inline void set(region_t region, int byte_value, optional_ref stream) { - return set(region.start(), byte_value, region.size()); + return set(region.start(), byte_value, region.size(), stream); } /** @@ -735,10 +748,11 @@ inline void set(region_t region, int byte_value) * * @param region the memory region to zero-out; may be in host-side memory, * global CUDA-device-side memory or CUDA-managed memory. + * @param stream A stream on which to schedule this action; may be omitted. 
*/ -inline void zero(region_t region) +inline void zero(region_t region, optional_ref stream) { - return set(region, 0); + return set(region, 0, stream); } /** @@ -747,10 +761,11 @@ inline void zero(region_t region) * @param ptr the beginning of a region of memory to zero-out; may be in host-side * memory, global CUDA-device-side memory or CUDA-managed memory. * @param num_bytes the size in bytes of the region of memory to zero-out + * @param stream A stream on which to schedule this action; may be omitted. */ -inline void zero(void* ptr, size_t num_bytes) +inline void zero(void* ptr, size_t num_bytes, optional_ref stream) { - return set(ptr, 0, num_bytes); + return set(ptr, 0, num_bytes, stream); } /** @@ -758,7 +773,8 @@ inline void zero(void* ptr, size_t num_bytes) * * @param ptr pointer to a single element of a certain type, which may * be in host-side memory, global CUDA-device-side memory or CUDA-managed - * memory + * memory. + * @param stream A stream on which to schedule this action; may be omitted. */ template inline void zero(T* ptr) @@ -1289,11 +1305,8 @@ inline void copy(void* destination, const_region_t source, optional_ref void typed_set(T* start, const T& value, size_t num_elements, optional_ref stream); -/** - * Asynchronously sets all bytes in a stretch of memory to a single value - * - * @note asynchronous version of @ref memory::set(void*, int, size_t) - * - * @param start starting address of the memory region to set, - * in a CUDA device's global memory - * @param byte_value value to set the memory region to - * @param num_bytes size of the memory region in bytes - * @param stream stream on which to schedule this action - */ -inline void set(void* start, int byte_value, size_t num_bytes, optional_ref stream) -{ - return typed_set( - static_cast(start), - static_cast(byte_value), - num_bytes, - stream); -} - /** * Asynchronously sets all bytes in a stretch of memory to 0. 
 * - * @note asynchronous version of @ref memory::zero(void*, size_t) - * - * @param start starting address of the memory region to set, - * in a CUDA device's global memory - * @param num_bytes size of the memory region in bytes - * @param stream stream on which to schedule this action + * @param start starting address of the memory region to set, in a CUDA device's global memory + * @param num_bytes size of the memory region in bytes + * @param stream A stream on which to enqueue the operation; may be omitted. + * */ void zero(void* start, size_t num_bytes, optional_ref stream); -/** - * Asynchronously sets all bytes of a single pointed-to value - * to 0 (zero). - * - * @note asynchronous version of @ref memory::zero(T*) - * - * @param ptr a pointer to the value to be to zero; must be valid in the - * CUDA context of @p stream - * @param stream stream on which to schedule this action - */ -template -inline void zero(T* ptr, optional_ref stream) -{ - zero(ptr, sizeof(T), stream); -} - -} // namespace async - } // namespace device namespace inter_context { @@ -1846,10 +1819,13 @@ inline void deregister(const_region_t region) * Sets all bytes in a stretch of host-side memory to a single value * * @note a wrapper for @ref ::std::memset - * + * @param byte_value The value to set each byte in the memory region to. + */ +///@{ + +/** * @param start starting address of the memory region to set, * in host memory; can be either CUDA-allocated or otherwise. - * @param byte_value value to set the memory region to * @param num_bytes size of the memory region in bytes */ inline void set(void* start, int byte_value, size_t num_bytes) @@ -1858,6 +1834,14 @@ inline void set(void* start, int byte_value, size_t num_bytes) // TODO: Error handling? 
} +/** + * @param region The region of memory to set to the fixed value + */ +inline void set(region_t region, int byte_value) +{ + set(region.start(), byte_value, region.size()); +} + /** * Zero-out a region of host memory * diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp index 57d0926b..800fb0ba 100644 --- a/src/cuda/api/multi_wrapper_impls/memory.hpp +++ b/src/cuda/api/multi_wrapper_impls/memory.hpp @@ -140,11 +140,6 @@ inline void free(void* ptr) namespace async { -template -inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream) -{ - detail_::set(start, value, num_elements, stream.handle()); -} inline void zero(void* start, size_t num_bytes, const stream_t& stream) { @@ -409,8 +404,11 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib namespace device { template -inline void typed_set(T* start, const T& value, size_t num_elements) +inline void typed_set(T* start, const T& value, size_t num_elements, optional_ref stream) { + if (stream) { + detail_::set(start, value, num_elements, stream->handle()); + } context::current::detail_::scoped_existence_ensurer_t ensure_some_context{}; static_assert(::std::is_trivially_copyable::value, "Non-trivially-copyable types cannot be used for setting memory"); static_assert(sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4, @@ -418,25 +416,34 @@ inline void typed_set(T* start, const T& value, size_t num_elements) // TODO: Consider checking for alignment when compiling without NDEBUG status_t result {CUDA_SUCCESS}; switch(sizeof(T)) { - case 1: result = cuMemsetD8 (address(start), reinterpret_cast(value), num_elements); break; - case 2: result = cuMemsetD16(address(start), reinterpret_cast(value), num_elements); break; - case 4: result = cuMemsetD32(address(start), reinterpret_cast(value), num_elements); break; + case 1: result = stream ? 
+ cuMemsetD8Async (address(start), reinterpret_cast(value), num_elements, stream->handle()) : + cuMemsetD8 (address(start), reinterpret_cast(value), num_elements); break; + case 2: result = stream ? + cuMemsetD16Async(address(start), reinterpret_cast(value), num_elements, stream->handle()) : + cuMemsetD16 (address(start), reinterpret_cast(value), num_elements); break; + case 4: result = stream ? + cuMemsetD32Async(address(start), reinterpret_cast(value), num_elements, stream->handle()) : + cuMemsetD32 (address(start), reinterpret_cast(value), num_elements); break; } throw_if_error_lazy(result, "Setting global device memory bytes"); } } // namespace device -inline void set(void* ptr, int byte_value, size_t num_bytes) +inline void set(void* ptr, int byte_value, size_t num_bytes, optional_ref stream) { switch ( type_of(ptr) ) { case device_: // case managed_: case unified_: - memory::device::set(ptr, byte_value, num_bytes); break; + memory::device::set(ptr, byte_value, num_bytes, stream); break; // case unregistered_: case host_: - ::std::memset(ptr, byte_value, num_bytes); break; + if (stream) { + throw ::std::invalid_argument("Asynchronous host-memory set's not currently supported"); + } else { ::std::memset(ptr, byte_value, num_bytes); } + break; default: throw runtime_error( cuda::status::invalid_value, diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp index a7e3f0cb..1912e8ac 100644 --- a/src/cuda/api/stream.hpp +++ b/src/cuda/api/stream.hpp @@ -481,7 +481,7 @@ class stream_t { { // Is it necessary to set the device? I wonder. 
CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_); - memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_); + memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_); } /// @copydoc memset(void *, int, size_t) const @@ -504,7 +504,7 @@ class stream_t { void memzero(void *start, size_t num_bytes) const { CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_); - memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_); + memory::device::detail_::zero(start, num_bytes, associated_stream.handle_); } /**