diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
index 3ee85249..f41efadd 100644
--- a/src/cuda/api/memory.hpp
+++ b/src/cuda/api/memory.hpp
@@ -180,11 +180,30 @@ namespace detail_ {
  *
  * @param num_bytes amount of memory to allocate in bytes
  */
+#if CUDA_VERSION >= 11020
+inline cuda::memory::region_t allocate_in_current_context(
+	size_t num_bytes, optional<stream::handle_t> stream_handle = {})
+#else
 inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
+#endif
 {
+#if CUDA_VERSION >= 11020
+	if (stream_handle) {
+		device::address_t allocated = 0;
+		// Note: the typed cudaMalloc also takes its size in bytes, apparently,
+		// not in number of elements
+		auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
+		if (is_success(status) && allocated == 0) {
+			// Can this even happen? hopefully not
+			status = static_cast<status_t>(status::unknown);
+		}
+		throw_if_error_lazy(status,
+			"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
+			" bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()) );
+		return {as_pointer(allocated), num_bytes};
+	}
+#endif
 	device::address_t allocated = 0;
-	// Note: the typed cudaMalloc also takes its size in bytes, apparently,
-	// not in number of elements
 	auto status = cuMemAlloc(&allocated, num_bytes);
 	if (is_success(status) && allocated == 0) {
 		// Can this even happen? hopefully not
@@ -195,110 +214,90 @@ inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
 	return {as_pointer(allocated), num_bytes};
 }
 
-inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes)
+#if CUDA_VERSION >= 11020
+inline region_t allocate(
+	context::handle_t context_handle,
+	size_t size_in_bytes,
+	optional<stream::handle_t> stream_handle = {})
+{
+	CAW_SET_SCOPE_CONTEXT(context_handle);
+	return allocate_in_current_context(size_in_bytes, stream_handle);
+}
+#else
+inline region_t allocate(
+	context::handle_t context_handle,
+	size_t size_in_bytes)
 {
 	CAW_SET_SCOPE_CONTEXT(context_handle);
 	return allocate_in_current_context(size_in_bytes);
 }
-
-} // namespace detail_
+#endif
 
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
-
-/// Allocate memory asynchronously on a specified stream.
-inline region_t allocate(
-	context::handle_t context_handle,
-	stream::handle_t stream_handle,
-	size_t num_bytes)
+inline void free_on_stream(
+	void* allocated_region_start,
+	stream::handle_t stream_handle)
 {
-	device::address_t allocated = 0;
-	// Note: the typed cudaMalloc also takes its size in bytes, apparently,
-	// not in number of elements
-	auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle);
-	if (is_success(status) && allocated == 0) {
-		// Can this even happen? hopefully not
-		status = static_cast<status_t>(status::unknown);
-	}
+	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
 	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
-		" bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) );
-	return {as_pointer(allocated), num_bytes};
+		"Failed scheduling an asynchronous freeing of the global memory region starting at "
+		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
+		+ stream::detail_::identify(stream_handle));
 }
+#endif // CUDA_VERSION >= 11020
-} // namespace detail_
-
-/**
- * Schedule an allocation of device-side memory on a CUDA stream.
- *
- * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable"
- * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes.
- *
- * @throws cuda::runtime_error if scheduling fails for any reason
- *
- * @param stream the stream on which to register the allocation
- * @param size_in_bytes the amount of memory to allocate
- * @return a pointer to the region of memory which will become allocated once the stream
- * completes all previous tasks and proceeds to also complete the allocation.
- */
-region_t allocate(const stream_t& stream, size_t size_in_bytes);
-
-} // namespace async
-#endif
-
-/// Free a region of device-side memory (regardless of how it was allocated)
-inline void free(void* ptr)
+inline void free_in_current_context(
+	context::handle_t current_context_handle,
+	void* allocated_region_start)
 {
-	auto result = cuMemFree(address(ptr));
-#ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
+	auto result = cuMemFree(address(allocated_region_start));
 	if (result == status::success) { return; }
-#else
-	if (result == status::success or result == status::context_is_destroyed) { return; }
+#ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
+	if (result == status::context_is_destroyed) { return; }
 #endif
-	throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
+	throw runtime_error(result, "Freeing device memory at "
+		+ cuda::detail_::ptr_as_hex(allocated_region_start)
+		+ " in " + context::detail_::identify(current_context_handle));
 }
 
-/// @copydoc free(void*)
-inline void free(region_t region) { free(region.start()); }
+} // namespace detail_
 
+/// Free a region of device-side memory (regardless of how it was allocated)
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
+inline void free(void* region_start, optional_ref<const stream_t> stream = {});
+#else
+inline void free(void* ptr);
+#endif
 
-inline void free(
-	context::handle_t context_handle,
-	stream::handle_t stream_handle,
-	void* allocated_region_start)
+#if CUDA_VERSION >= 11020
+/// @copydoc free(void*, optional_ref<const stream_t>)
+inline void free(region_t region, optional_ref<const stream_t> stream = {})
 {
-	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
-	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous freeing of the global memory region starting at "
-		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
-		+ stream::detail_::identify(stream_handle, context_handle) );
+	free(region.start(), stream);
 }
+#else
+/// @copydoc free(void*)
+inline void free(region_t region)
+{
+	free(region.start());
+}
+#endif
 
-} // namespace detail_
-
+#if CUDA_VERSION >= 11020
 /**
- * Schedule a de-allocation of device-side memory on a CUDA stream.
+ * Schedule an allocation of device-side memory on a CUDA stream.
  *
- * @throws cuda::runtime_error if freeing fails
+ * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable"
+ * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes.
+ *
+ * @throws cuda::runtime_error if scheduling fails for any reason
  *
  * @param stream the stream on which to register the allocation
+ * @param size_in_bytes the amount of memory to allocate
+ * @return a pointer to the region of memory which will become allocated once the stream
+ * completes all previous tasks and proceeds to also complete the allocation.
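+ *
+ * A minimal usage sketch (illustrative only; assumes a valid current device,
+ * and uses only the optional-stream overloads introduced in this change):
+ * @code
+ * auto stream = cuda::device::current::get().default_stream();
+ * auto region = cuda::memory::device::allocate(1024, stream);
+ * // ... enqueue work on `stream` which uses `region` ...
+ * cuda::memory::device::free(region, stream);
+ * @endcode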
 */
-	///@{
-void free(const stream_t& stream, void* region_start);
-
-inline void free(const stream_t& stream, region_t region)
-{
-	free(stream, region.data());
-}
-///@}
-
-} // namespace async
+region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream);
 #endif
 
 /**
@@ -335,7 +334,9 @@ namespace detail_ {
 // Note: Allocates _in the current context_! No current context => failure!
 struct allocator {
-	void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); }
+	void* operator()(size_t num_bytes) const {
+		return detail_::allocate_in_current_context(num_bytes).start();
+	}
 };
 
 struct deleter {
@@ -354,9 +355,10 @@ struct deleter {
  * @param start The first location to set to @p value ; must be properly aligned.
  * @param value A (properly aligned) value to set T-elements to.
  * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes).
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
 template <typename T>
-void typed_set(T* start, const T& value, size_t num_elements);
+void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});
 
 /**
  * Sets all bytes in a region of memory to a fixed value
@@ -367,10 +369,26 @@
  * @param start starting address of the memory region to set, in a CUDA
  * device's global memory
  * @param num_bytes size of the memory region in bytes
+ * @param stream a stream on which to schedule the operation; may be omitted,
+ * in which case the bytes are set synchronously
 */
-inline void set(void* start, int byte_value, size_t num_bytes)
+inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	return typed_set<unsigned char>(static_cast<unsigned char*>(start), static_cast<unsigned char>(byte_value), num_bytes);
+	return typed_set<unsigned char>(
+		static_cast<unsigned char*>(start),
+		static_cast<unsigned char>(byte_value),
+		num_bytes,
+		stream);
 }
 
 /**
@@ -380,10 +398,11 @@ inline void set(region_t region, int byte_value)
  *
  * @param byte_value value to set the memory region to
  * @param region a region to zero-out, in a CUDA device's global memory
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void set(region_t region, int byte_value)
+inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
 {
-	set(region.start(), byte_value, region.size());
+	set(region.start(), byte_value, region.size(), stream);
 }
 
 /**
@@ -392,10 +411,11 @@ inline void set(region_t region, int byte_value)
  * @param start the beginning of a region of memory to zero-out, accessible
  * within a CUDA device's global memory
  * @param num_bytes the size in bytes of the region of memory to zero-out
+ * @param stream A stream on which to schedule this action; may be omitted.
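+ *
+ * For example (a sketch, relying on this header's own allocate()):
+ * @code
+ * auto region = cuda::memory::device::allocate(4096);
+ * cuda::memory::device::zero(region.start(), region.size()); // synchronous; no stream passed
+ * @endcode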
 */
-inline void zero(void* start, size_t num_bytes)
+inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	set(start, 0, num_bytes);
+	set(start, 0, num_bytes, stream);
 }
 
 /**
@@ -403,23 +423,24 @@
  *
  * @param region the memory region to zero-out, accessible as a part of a
  * CUDA device's global memory
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void zero(region_t region)
+inline void zero(region_t region, optional_ref<const stream_t> stream = {})
 {
-	zero(region.start(), region.size());
+	zero(region.start(), region.size(), stream);
 }
-
 /**
  * Sets all bytes of a single pointed-to value to 0
  *
  * @param ptr pointer to a value of a certain type, accessible within
  * a CUDA device's global memory
+ * @param stream an existing stream on which to schedule this action; may be omitted
 */
 template <typename T>
-inline void zero(T* ptr)
+inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
 {
-	zero(ptr, sizeof(T));
+	zero(ptr, sizeof(T), stream);
 }
 
 } // namespace device
@@ -427,15 +448,6 @@ inline void zero(T* ptr)
 
 /// Asynchronous memory operations
 namespace detail_ {
 
-/**
- * Asynchronous versions of @ref memory::copy functions.
- *
- *
- * @note Since we assume Compute Capability >= 2.0, all devices support the
- * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer,
- * where the data is located, and one does not have to specify this.
- */
-
 ///@{
 
 /**
@@ -692,8 +704,9 @@ inline void copy(c_array<T, N>& destination, T* source, optional_ref<const stream_t> stream = {});
 
 /**
  * Sets all bytes in a region of memory to a fixed value
@@ -704,10 +717,11 @@ void set(void* ptr, int byte_value, size_t num_bytes);
  * @param region the memory region to set; may be in host-side memory,
  * global CUDA-device-side memory or CUDA-managed memory.
  * @param byte_value value to set the memory region to
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void set(region_t region, int byte_value)
+inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
 {
-	return set(region.start(), byte_value, region.size());
+	return set(region.start(), byte_value, region.size(), stream);
 }
 
 /**
@@ -715,10 +729,11 @@ inline void set(region_t region, int byte_value)
  *
 * @param region the memory region to zero-out; may be in host-side memory,
 * global CUDA-device-side memory or CUDA-managed memory.
+ * @param stream A stream on which to schedule this action; may be omitted.
 */
-inline void zero(region_t region)
+inline void zero(region_t region, optional_ref<const stream_t> stream = {})
 {
-	return set(region, 0);
+	return set(region, 0, stream);
 }
 
 /**
@@ -727,10 +742,11 @@ inline void zero(region_t region)
  *
 * @param ptr the beginning of a region of memory to zero-out; may be in host-side
 * memory, global CUDA-device-side memory or CUDA-managed memory.
 * @param num_bytes the size in bytes of the region of memory to zero-out
+ * @param stream A stream on which to schedule this action; may be omitted.
 */
-inline void zero(void* ptr, size_t num_bytes)
+inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	return set(ptr, 0, num_bytes);
+	return set(ptr, 0, num_bytes, stream);
 }
 
 /**
@@ -738,7 +754,8 @@ inline void zero(void* ptr, size_t num_bytes)
 * Sets all bytes of a single pointed-to value to 0
 *
 * @param ptr pointer to a single element of a certain type, which may
 * be in host-side memory, global CUDA-device-side memory or CUDA-managed
- * memory
+ * memory.
+ * @param stream A stream on which to schedule this action; may be omitted.
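+ *
+ * For example (a sketch; the pointer could equally be host-side or managed):
+ * @code
+ * auto p = static_cast<float*>(cuda::memory::device::allocate(sizeof(float)).start());
+ * cuda::memory::zero(p); // zeroes the single pointed-to float, synchronously
+ * @endcode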
 */
 template <typename T>
-inline void zero(T* ptr)
+inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
 {
-	zero(ptr, sizeof(T));
+	zero(ptr, sizeof(T), stream);
 }
@@ -1269,11 +1286,8 @@ inline void copy(void* destination, const_region_t source, optional_ref<const stream_t> stream = {})
 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);
 
-/**
- * Asynchronously sets all bytes in a stretch of memory to a single value
- *
- * @note asynchronous version of @ref memory::set(void*, int, size_t)
- *
- * @param start starting address of the memory region to set,
- * in a CUDA device's global memory
- * @param byte_value value to set the memory region to
- * @param num_bytes size of the memory region in bytes
- * @param stream stream on which to schedule this action
- */
-inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
-{
-	return typed_set<unsigned char>(
-		static_cast<unsigned char*>(start),
-		static_cast<unsigned char>(byte_value),
-		num_bytes,
-		stream);
-}
-
 /**
  * Asynchronously sets all bytes in a stretch of memory to 0.
  *
- * @note asynchronous version of @ref memory::zero(void*, size_t)
- *
- * @param start starting address of the memory region to set,
- * in a CUDA device's global memory
- * @param num_bytes size of the memory region in bytes
- * @param stream stream on which to schedule this action
+ * @param start starting address of the memory region to set, in a CUDA device's global memory
+ * @param num_bytes size of the memory region in bytes
+ * @param stream A stream on which to enqueue the operation; may be omitted.
 */
 void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);
 
-/**
- * Asynchronously sets all bytes of a single pointed-to value
- * to 0 (zero).
- *
- * @note asynchronous version of @ref memory::zero(T*)
- *
- * @param ptr a pointer to the value to be to zero; must be valid in the
- * CUDA context of @p stream
- * @param stream stream on which to schedule this action
- */
-template <typename T>
-inline void zero(T* ptr, optional_ref<const stream_t> stream)
-{
-	zero(ptr, sizeof(T), stream);
-}
-
-} // namespace async
-
 } // namespace device
 
 namespace inter_context {
@@ -1826,10 +1801,13 @@ inline void deregister(const_region_t region)
 * Sets all bytes in a stretch of host-side memory to a single value
 *
 * @note a wrapper for @ref ::std::memset
- *
+ * @param byte_value The value to set each byte in the memory region to.
+ */
+///@{
+
+/**
 * @param start starting address of the memory region to set,
 * in host memory; can be either CUDA-allocated or otherwise.
- * @param byte_value value to set the memory region to
 * @param num_bytes size of the memory region in bytes
 */
 inline void set(void* start, int byte_value, size_t num_bytes)
 {
 	::std::memset(start, byte_value, num_bytes);
@@ -1838,6 +1816,14 @@
 	// TODO: Error handling?
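+	// Note: ::std::memset reports no errors, so unlike the CUDA memset
+	// wrappers above, there is no status to check here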
 }
+
+/**
+ * @param region The region of memory to set to the fixed value
+ */
+inline void set(region_t region, int byte_value)
+{
+	memory::set(region.start(), byte_value, region.size(), nullopt);
+}
+
 /**
  * Zero-out a region of host memory
  *
@@ -1856,7 +1842,7 @@ inline void zero(void* start, size_t num_bytes)
 */
 inline void zero(region_t region)
 {
-	set(region, 0);
+	host::set(region, 0);
 }
 
 /**
@@ -2128,8 +2114,6 @@ inline void free(region_t region)
 	free(region.start());
 }
 
-namespace async {
-
 namespace detail_ {
 
 inline void prefetch(
@@ -2164,8 +2148,6 @@ void prefetch_to_host(
 	const_region_t region,
 	const stream_t& stream);
 
-} // namespace async
-
 } // namespace managed
 
 namespace mapped {
@@ -2374,8 +2356,9 @@ namespace detail_ {
 
 template <typename T>
 unique_span<T> make_unique_span(const context::handle_t context_handle, size_t size)
 {
+	auto allocate_in_current_context_ = [](size_t size) { return allocate_in_current_context(size); };
 	CAW_SET_SCOPE_CONTEXT(context_handle);
-	return memory::detail_::make_convenient_type_unique_span<T>(size, allocate_in_current_context);
+	return memory::detail_::make_convenient_type_unique_span<T>(size, allocate_in_current_context_);
 }
 
 } // namespace detail_
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index d8ab0514..3a896405 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -114,32 +114,32 @@ inline region_t allocate(const device_t& device, size_t size_in_bytes)
 	return allocate(pc, size_in_bytes);
 }
 
-namespace async {
 #if CUDA_VERSION >= 11020
-inline region_t allocate(const stream_t& stream, size_t size_in_bytes)
+inline region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream = {})
 {
-	return detail_::allocate(stream.context().handle(), stream.handle(), size_in_bytes);
+	return stream ?
+		detail_::allocate(stream->context().handle(), size_in_bytes, stream->handle()) :
+		detail_::allocate_in_current_context(size_in_bytes);
 }
 
-inline void free(const stream_t& stream, void* region_start)
-{
-	return detail_::free(stream.context().handle(), stream.handle(), region_start);
-}
 #endif // CUDA_VERSION >= 11020
 
-template <typename T>
-inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream)
-{
-	detail_::set(start, value, num_elements, stream.handle());
-}
-
-inline void zero(void* start, size_t num_bytes, const stream_t& stream)
+#if CUDA_VERSION >= 11020
+inline void free(void* region_start, optional_ref<const stream_t> stream)
+#else
+inline void free(void* region_start)
+#endif // CUDA_VERSION >= 11020
 {
-	detail_::zero(start, num_bytes, stream.handle());
+#if CUDA_VERSION >= 11020
+	if (stream) {
+		detail_::free_on_stream(region_start, stream->handle());
+		return;
+	}
+#endif
+	context::current::detail_::scoped_existence_ensurer_t ensurer;
+	detail_::free_in_current_context(ensurer.context_handle, region_start);
 }
 
-} // namespace async
-
 } // namespace device
 
 namespace inter_context {
@@ -237,8 +237,6 @@ ::std::vector<device_t> expected_accessors(const_region_t region, con
 	return devices;
 }
 
-namespace async {
-
 inline void prefetch(
 	const_region_t region,
 	const cuda::device_t& destination,
@@ -252,8 +250,6 @@ inline void prefetch_to_host(const_region_t region, const stream_t& stream)
 {
 	detail_::prefetch(region, CU_DEVICE_CPU, stream.handle());
 }
 
-} // namespace async
-
 inline region_t allocate(
 	const context_t& context,
 	size_t num_bytes,
@@ -396,8 +392,8 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib
 namespace device {
 
 template <typename T>
-inline void typed_set(T* start, const T& value, size_t num_elements)
+inline void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream)
 {
 	context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
 	static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
 	static_assert(sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
@@ -405,25 +404,34 @@ inline void typed_set(T* start, const T& value, size_t num_elements)
 	// TODO: Consider checking for alignment when compiling without NDEBUG
 	status_t result {CUDA_SUCCESS};
 	switch(sizeof(T)) {
-	case 1: result = cuMemsetD8 (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
-	case 2: result = cuMemsetD16(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
-	case 4: result = cuMemsetD32(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
+	case 1: result = stream ?
+		cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream->handle()) :
+		cuMemsetD8 (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
+	case 2: result = stream ?
+		cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream->handle()) :
+		cuMemsetD16 (address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
+	case 4: result = stream ?
+		cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream->handle()) :
+		cuMemsetD32 (address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
 	}
 	throw_if_error_lazy(result, "Setting global device memory bytes");
 }
 
 } // namespace device
 
-inline void set(void* ptr, int byte_value, size_t num_bytes)
+inline void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
 {
 	switch ( type_of(ptr) ) {
 	case device_:
 //	case managed_:
 	case unified_:
-		memory::device::set(ptr, byte_value, num_bytes); break;
+		memory::device::set(ptr, byte_value, num_bytes, stream); break;
 //	case unregistered_:
 	case host_:
-		::std::memset(ptr, byte_value, num_bytes); break;
+		if (stream) {
+			throw ::std::invalid_argument("Asynchronously setting host-side memory is not currently supported");
+		} else { ::std::memset(ptr, byte_value, num_bytes); }
+		break;
 	default:
 		throw runtime_error(
 			cuda::status::invalid_value,
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index cf6547e2..1912e8ac 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -481,7 +481,7 @@ class stream_t {
 	{
 		// Is it necessary to set the device? I wonder.
 		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
-		memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
+		memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
 	}
 
 	/// @copydoc memset(void *, int, size_t) const
@@ -504,7 +504,7 @@ class stream_t {
 	void memzero(void *start, size_t num_bytes) const
 	{
 		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
-		memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_);
+		memory::device::detail_::zero(start, num_bytes, associated_stream.handle_);
 	}
 
 	/**
@@ -590,7 +590,7 @@ class stream_t {
 	 */
 	memory::region_t allocate(size_t num_bytes) const
 	{
-		return memory::device::async::allocate(associated_stream, num_bytes);
+		return memory::device::allocate(num_bytes, associated_stream);
 	}
 
 	memory::region_t allocate(const memory::pool_t& pool, size_t num_bytes);
@@ -601,14 +601,14 @@ class stream_t {
 	///@{
 	void free(void* region_start) const
 	{
-		memory::device::async::free(associated_stream, region_start);
+		memory::device::free(region_start, associated_stream);
 	}
 
 	void free(memory::region_t region) const
 	{
-		memory::device::async::free(associated_stream, region);
+		memory::device::free(region, associated_stream);
 	}
-#endif
+#endif // CUDA_VERSION >= 11020
 
 	/**
 	 * Sets the attachment of a region of managed memory (i.e. in the address space visible