diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
index ff612849..3e09d490 100644
--- a/src/cuda/api/memory.hpp
+++ b/src/cuda/api/memory.hpp
@@ -180,11 +180,30 @@ namespace detail_ {
  *
  * @param num_bytes amount of memory to allocate in bytes
  */
+#if CUDA_VERSION >= 11020
+inline cuda::memory::region_t allocate_in_current_context(
+	size_t num_bytes, optional<stream::handle_t> stream_handle = {})
+#else
 inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
+#endif
 {
+#if CUDA_VERSION >= 11020
+	if (stream_handle) {
+		device::address_t allocated = 0;
+		// Note: the typed cudaMalloc also takes its size in bytes, apparently,
+		// not in number of elements
+		auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
+		if (is_success(status) && allocated == 0) {
+			// Can this even happen? hopefully not
+			status = static_cast<status_t>(status::unknown);
+		}
+		throw_if_error_lazy(status,
+			"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
+			" bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()) );
+		return {as_pointer(allocated), num_bytes};
+	}
+#endif
 	device::address_t allocated = 0;
-	// Note: the typed cudaMalloc also takes its size in bytes, apparently,
-	// not in number of elements
 	auto status = cuMemAlloc(&allocated, num_bytes);
 	if (is_success(status) && allocated == 0) {
 		// Can this even happen? hopefully not
@@ -195,59 +214,23 @@ inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
 	return {as_pointer(allocated), num_bytes};
 }
 
-inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes)
-{
-	CAW_SET_SCOPE_CONTEXT(context_handle);
-	return allocate_in_current_context(size_in_bytes);
-}
-
-} // namespace detail_
-
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
-
-/// Allocate memory asynchronously on a specified stream.
 inline region_t allocate(
-	context::handle_t           context_handle,
-	stream::handle_t            stream_handle,
-	size_t                      num_bytes)
+	context::handle_t           context_handle,
+	size_t                      size_in_bytes,
+	optional<stream::handle_t>  stream_handle = {})
+#else
+inline region_t allocate(
+	context::handle_t           context_handle,
+	size_t                      size_in_bytes)
+#endif
 {
-	device::address_t allocated = 0;
-	// Note: the typed cudaMalloc also takes its size in bytes, apparently,
-	// not in number of elements
-	auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle);
-	if (is_success(status) && allocated == 0) {
-		// Can this even happen? hopefully not
-		status = static_cast<status_t>(status::unknown);
-	}
-	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
-		" bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) );
-	return {as_pointer(allocated), num_bytes};
+	CAW_SET_SCOPE_CONTEXT(context_handle);
+	return allocate_in_current_context(size_in_bytes, stream_handle);
 }
 
 } // namespace detail_
 
-/**
- * Schedule an allocation of device-side memory on a CUDA stream.
- *
- * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable"
- * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes.
- *
- * @throws cuda::runtime_error if scheduling fails for any reason
- *
- * @param stream the stream on which to register the allocation
- * @param size_in_bytes the amount of memory to allocate
- * @return a pointer to the region of memory which will become allocated once the stream
- * completes all previous tasks and proceeds to also complete the allocation.
- */
-region_t allocate(const stream_t& stream, size_t size_in_bytes);
-
-} // namespace async
-#endif
-
 /// Free a region of device-side memory (regardless of how it was allocated)
 inline void free(void* ptr)
 {
@@ -301,6 +284,23 @@ inline void free(const stream_t& stream, region_t region)
 } // namespace async
 #endif
 
+#if CUDA_VERSION >= 11020
+/**
+ * Schedule an allocation of device-side memory on a CUDA stream.
+ *
+ * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable"
+ * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes.
+ *
+ * @throws cuda::runtime_error if scheduling fails for any reason
+ *
+ * @param stream the stream on which to register the allocation
+ * @param size_in_bytes the amount of memory to allocate
+ * @return a pointer to the region of memory which will become allocated once the stream
+ * completes all previous tasks and proceeds to also complete the allocation.
+ */
+region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream);
+#endif
+
 /**
  * Allocate device-side memory on a CUDA device context.
  *
@@ -335,7 +335,9 @@ namespace detail_ {
 
 // Note: Allocates _in the current context_! No current context => failure!
 struct allocator {
-	void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); }
+	void* operator()(size_t num_bytes) const {
+		return detail_::allocate_in_current_context(num_bytes).start();
+	}
 };
 
 struct deleter {
@@ -2375,8 +2377,9 @@ namespace detail_ {
 template <typename T>
 unique_span<T> make_unique_span(const context::handle_t context_handle, size_t size)
 {
+	auto allocate_in_current_context_ = [](size_t size) { return allocate_in_current_context(size); };
 	CAW_SET_SCOPE_CONTEXT(context_handle);
-	return memory::detail_::make_convenient_type_unique_span<T>(size, allocate_in_current_context);
+	return memory::detail_::make_convenient_type_unique_span<T>(size, allocate_in_current_context_);
 }
 
 } // namespace detail_
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index 9083b5ba..b6635340 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -113,13 +113,16 @@ inline region_t allocate(const device_t& device, size_t size_in_bytes)
 	return allocate(pc, size_in_bytes);
 }
 
-namespace async {
 #if CUDA_VERSION >= 11020
-inline region_t allocate(const stream_t& stream, size_t size_in_bytes)
+inline region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream = {})
 {
-	return detail_::allocate(stream.context().handle(), stream.handle(), size_in_bytes);
+	return stream ?
+		detail_::allocate(stream->context().handle(), size_in_bytes, stream->handle()) :
+		detail_::allocate_in_current_context(size_in_bytes);
 }
 
+namespace async {
+
 inline void free(const stream_t& stream, void* region_start)
 {
 	return detail_::free(stream.context().handle(), stream.handle(), region_start);
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index cf6547e2..953c034d 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -590,7 +590,7 @@ class stream_t {
 	 */
 	memory::region_t allocate(size_t num_bytes) const
 	{
-		return memory::device::async::allocate(associated_stream, num_bytes);
+		return memory::device::allocate(num_bytes, associated_stream);
 	}
 
 	memory::region_t allocate(const memory::pool_t& pool, size_t num_bytes);
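For review convenience, here is a minimal usage sketch of the entry points this patch reworks. It is not part of the diff: the setup calls (`cuda::device::current::get()`, `device.default_stream()`) and the function name `allocation_roundtrip` are illustrative assumptions; only the two `allocate` calls and the `async::free` call exercise code touched here, and the `stream.enqueue.allocate` alternative is my reading of the stream.hpp hunk.

```cpp
// Usage sketch only - not part of the patch. Assumes a CUDA 11.2+ build
// (the stream-taking overload is gated on CUDA_VERSION >= 11020) and the
// library's umbrella header.
#include <cuda/api.hpp>

#include <cstddef>

void allocation_roundtrip(std::size_t num_bytes)
{
	auto device = cuda::device::current::get();
	auto stream = device.default_stream();

	// Plain, synchronous allocation on a device - unchanged by this patch:
	auto plain = cuda::memory::device::allocate(device, num_bytes);

	// Stream-ordered allocation now goes through the same overload set, with
	// the stream passed as a trailing optional_ref (previously this was
	// cuda::memory::device::async::allocate(stream, num_bytes)):
	auto enqueued = cuda::memory::device::allocate(num_bytes, stream);

	// Per the stream.hpp hunk, the stream wrapper forwards to the same
	// entry point (assumed spelling, via the enqueue sub-object):
	// auto enqueued2 = stream.enqueue.allocate(num_bytes);

	// Stream-ordered free still lives in the async namespace:
	cuda::memory::device::async::free(stream, enqueued);
	stream.synchronize();
	cuda::memory::device::free(plain.start());
}
```

Design note on the `make_unique_span` hunk: the wrapping lambda appears to be needed because `allocate_in_current_context` now carries a second, defaulted stream parameter on CUDA 11.2+, and default arguments do not survive when the function is passed by name as a callable that must be invocable with a single size argument.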