From 8fded6d640a7d449f0bb577ad9bdb0aa1ded3ad9 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sun, 20 Oct 2024 01:21:57 +0300 Subject: [PATCH] Unified async and non-async copy functions: * All copy functions now take an optional stream via an `optional_ref` parameter; * No longer using the `cuda::memory::async` subnamespace for any copy functions; they are all directly in `cuda::memory` * Fixes #688: Now supporting async copy using copy parameters structures * Explicitly including `memory.hpp` in `multi_wrapper_impls/memory.hpp` --- .../binaryPartitionCG/binaryPartitionCG.cu | 4 +- .../p2pBandwidthLatencyTest.cu | 2 +- .../simpleDrvRuntimePTX.cpp | 6 +- .../simpleStreams/simpleStreams.cu | 4 +- examples/other/array_management.cu | 8 +- src/cuda/api/memory.hpp | 1000 ++++++++--------- src/cuda/api/multi_wrapper_impls/memory.hpp | 176 ++- src/cuda/api/stream.hpp | 2 +- src/cuda/api/types.hpp | 1 + 9 files changed, 529 insertions(+), 674 deletions(-) diff --git a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu index 8444a562..6877015c 100644 --- a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu +++ b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu @@ -124,8 +124,8 @@ int main(int argc, const char **argv) stream.enqueue.kernel_launch(kernel, launch_config, d_inputArr.data(), d_numOfOdds.data(), d_sumOfOddEvenElems.data(), arrSize); - cuda::memory::async::copy(h_numOfOdds, d_numOfOdds, stream); - cuda::memory::async::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream); + cuda::memory::copy(h_numOfOdds, d_numOfOdds, stream); + cuda::memory::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream); stream.synchronize(); diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index f7541c36..da2972fb 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -154,7 +154,7 @@ void enqueue_p2p_copy( // Since we assume Compute Capability >= 2.0, all devices support the // Unified Virtual Address Space, so we don't need to use // cudaMemcpyPeerAsync - cudaMemcpyAsync is enough. 
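+        // A minimal illustration of the unified API (using this function's
+        // `dest`, `src` and `stream`): the stream argument is now optional,
+        // so a single overload covers both cases:
+        //
+        //     cuda::memory::copy(dest, src, stream); // enqueued on `stream`
+        //     cuda::memory::copy(dest, src);         // synchronous copy
+        //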
- cuda::memory::async::copy(dest, src, stream); + cuda::memory::copy(dest, src, stream); } } } diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp index 352d037f..8f4211a8 100644 --- a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp +++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp @@ -152,8 +152,8 @@ int main(int argc, char** argv) auto d_C = cuda::memory::make_unique_span(device, N); - cuda::memory::async::copy(d_A, h_A.get(), size, stream); - cuda::memory::async::copy(d_B, h_B.get(), size, stream); + cuda::memory::copy(d_A, h_A.get(), size, stream); + cuda::memory::copy(d_B, h_B.get(), size, stream); auto launch_config = cuda::launch_config_builder() .overall_size(N) @@ -164,7 +164,7 @@ int main(int argc, char** argv) stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N); - cuda::memory::async::copy(h_C.get(), d_C, size, stream); + cuda::memory::copy(h_C.get(), d_C, size, stream); stream.synchronize(); for (int i = 0; i < N; ++i) { diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu index 88230be3..d4d873e9 100644 --- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu +++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu @@ -143,7 +143,7 @@ void run_simple_streams_example( // time memcpy from device start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed - cuda::memory::async::copy(h_a.get(), d_a, streams[0]); + cuda::memory::copy(h_a.get(), d_a, streams[0]); stop_event.record(); stop_event.synchronize(); // block until the event is actually recorded auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event); @@ -207,7 +207,7 @@ void run_simple_streams_example( // commence executing when all previous CUDA calls in stream x have completed for (int i = 0; i < nstreams; i++) { - cuda::memory::async::copy( + cuda::memory::copy( h_a.data() + i * params.n / nstreams, d_a.data() + i * params.n / nstreams, nbytes / nstreams, streams[i]); diff --git a/examples/other/array_management.cu b/examples/other/array_management.cu index a592d7e9..659381a0 100644 --- a/examples/other/array_management.cu +++ b/examples/other/array_management.cu @@ -97,8 +97,8 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) { // also asynchronously auto stream = device.create_stream(cuda::stream::async); - cuda::memory::async::copy(other_arr, span_out, stream); - cuda::memory::async::copy(span_in, other_arr, stream); + cuda::memory::copy(other_arr, span_out, stream); + cuda::memory::copy(span_in, other_arr, stream); device.synchronize(); check_output_is_iota("copy from (managed) global memory into a 3D array, asynchronously", span_in); } @@ -162,8 +162,8 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h) // also asynchronously auto stream = cuda::stream::create(device, cuda::stream::async); - cuda::memory::async::copy(other_arr, span_out, stream); - cuda::memory::async::copy(span_in, other_arr, stream); + cuda::memory::copy(other_arr, span_out, stream); + cuda::memory::copy(span_in, other_arr, stream); device.synchronize(); check_output_is_iota("copy from (managed) global memory into a 2D array, asynchronously", span_in); diff --git a/src/cuda/api/memory.hpp 
b/src/cuda/api/memory.hpp index cd8e9dd7..ff612849 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -424,68 +424,183 @@ inline void zero(T* ptr) } // namespace device +/// Asynchronous memory operations +namespace detail_ { + /** - * @note Since we assume Compute Capability >= 2.0, all devices support the - * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, - * used in a copy function, where the data is located, and one does not have to specify this. + * Asynchronous versions of @ref memory::copy functions. * - * @note the sources and destinations may all be in any memory space addressable - * in the the unified virtual address space, which could be host-side memory, - * device global memory, device constant memory etc. * + * @note Since we assume Compute Capability >= 2.0, all devices support the + * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, + * where the data is located, and one does not have to specify this. */ + ///@{ /** - * Synchronously copy data between different locations in memory + * Asynchronously copies data between memory spaces or within a memory space, but + * within a single CUDA context. * - * @param source A pointer to a a memory region of size @p num_bytes. - * @param num_bytes The number of bytes to copy from @p source to @p destination - */ -void copy(void *destination, const void *source, size_t num_bytes); - -/** - * @param destination A memory region of the same size as @p source. - * @param source A region whose contents is to be copied. - */ -inline void copy(void* destination, const_region_t source) + * @param destination A pointer to a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source A pointer to a memory region of size at least @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param num_bytes number of bytes to copy from @p source + * @param stream_handle The handle of a stream on which to schedule the copy operation +*/ +inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle) { - return copy(destination, source.start(), source.size()); + auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle); + + // TODO: Determine whether it was from host to device, device to host etc and + // add this information to the error string + throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle)); } /** - * @param destination A region of memory to which to copy the data in @p source, of - * size at least that of @p source , either in host memory or on any CUDA - * device's global memory. 
- * @param source A region whose contents is to be copied, either in host memory - * or on any CUDA device's global memory + * @param destination a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param source a memory region of size @p num_bytes, either in + * host memory or on any CUDA device's global memory + * @param stream_handle The handle of a stream on which to schedule the copy operation */ -inline void copy(region_t destination, const_region_t source) +inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle) { #ifndef NDEBUG if (destination.size() < source.size()) { - throw ::std::logic_error("Can't copy a large region into a smaller one"); + throw ::std::logic_error("Source size exceeds destination size"); } #endif - return copy(destination.start(), source); + copy(destination.start(), source.start(), source.size(), stream_handle); +} +///@} + +using memory::copy_parameters_t; + +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<2> params, + optional stream_handle) +{ + // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters + // structure holds no information about contexts. + // + // Note: The stream handle, even if present, might be the null handle; for now + // we distinguish between using the null stream handle - the default stream's - + // and using the synchronous API + return stream_handle ? + cuMemcpy2DAsync(¶ms, *stream_handle) : + cuMemcpy2D(¶ms); +} + +inline status_t multidim_copy_in_current_context( + ::std::integral_constant, + copy_parameters_t<3> params, + optional stream_handle) +{ + if (params.srcContext == params.dstContext) { + // TODO: Should we check it's also the current context? + using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type; + auto* intra_context_params = reinterpret_cast(¶ms); + return stream_handle ? + cuMemcpy3DAsync(intra_context_params, *stream_handle) : + cuMemcpy3D(intra_context_params); + } + return stream_handle ? + cuMemcpy3DPeerAsync(¶ms, *stream_handle) : + cuMemcpy3DPeer(¶ms); +} + +template +status_t multidim_copy_in_current_context(copy_parameters_t params, optional stream_handle) { + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); +} + +// Note: Assumes the stream handle is for a stream in the current context +template +status_t multidim_copy( + context::handle_t context_handle, + copy_parameters_t params, + optional stream_handle) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); +} + +// Assumes the array and the stream share the same context, and that the destination is +// accessible from that context (e.g. allocated within it, or being managed memory, etc.) 
+template +void copy(T *destination, const array_t& source, optional stream_handle) +{ + using memory::endpoint_t; + auto dims = source.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source); + params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); + params.set_default_pitches(); + params.clear_rest(); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); +} + + +template +void copy(const array_t& destination, const T* source, optional stream_handle) +{ + using memory::endpoint_t; + auto dims = destination.dimensions(); + //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); + auto params = copy_parameters_t{}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, const_cast(source), dims); + params.set_endpoint(endpoint_t::destination, destination); + params.set_default_pitches(); + params.clear_rest(); + auto status = multidim_copy_in_current_context(params, stream_handle); + throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); } /** - * @param destination A region of memory to which to copy the data in @p source, - * of size at least that of @p source. - * @param source A plain array whose contents is to be copied. + * Synchronously copies a single (typed) value between memory spaces or within a memory space. + * + * @note asynchronous version of @ref memory::copy_single + * + * @note assumes the source and destination are all valid in the same context as that of the + * context handle + * + * @param destination a value residing either in host memory or on any CUDA device's + * global memory + * @param source a value residing either in host memory or on any CUDA device's global + * memory + * @param stream_handle A stream on which to enqueue the copy operation */ -template -inline void copy(region_t destination, const T(&source)[N]) +template +void copy_single(T* destination, const T* source, optional stream_handle) { -#ifndef NDEBUG - if (destination.size() < N * sizeof(T)) { - throw ::std::logic_error("Source size exceeds destination size"); - } -#endif - return copy(destination.start(), source, sizeof(T) * N); + copy(destination, source, sizeof(T), stream_handle); } +} // namespace detail_ + +/** + * @note Since we assume Compute Capability >= 2.0, all devices support the + * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, + * used in a copy function, where the data is located, and one does not have to specify this. + * + * @note the sources and destinations may all be in any memory space addressable + * in the the unified virtual address space, which could be host-side memory, + * device global memory, device constant memory etc. + * + */ +///@{ + /** * Copy the contents of a C-style array into a span of same-type elements * @@ -495,7 +610,7 @@ inline void copy(region_t destination, const T(&source)[N]) * of the first element, there is no array-decay. 
*/ template -inline void copy(span destination, const T(&source)[N]) +inline void copy(span destination, const T(&source)[N], optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < N) { @@ -505,29 +620,6 @@ inline void copy(span destination, const T(&source)[N]) return copy(destination.data(), source, sizeof(T) * N); } -/** - * Copy the contents of memory region into a C-style array, interpreting the memory - * as a sequence of elements of the array's element type - * - * @param destination A region of memory to which to copy the data in @p source, - * of size at least that of @p source. - * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill - * the @p destination array. - */ -template -inline void copy(T(&destination)[N], const_region_t source) -{ -#ifndef NDEBUG - size_t required_size = N * sizeof(T); - if (source.size() != required_size) { - throw ::std::invalid_argument( - "Attempt to copy a region of " + ::std::to_string(source.size()) + - " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); - } -#endif - return copy(destination, source.start(), sizeof(T) * N); -} - /** * Copy the contents of a span into a C-style array * @@ -538,7 +630,7 @@ inline void copy(T(&destination)[N], const_region_t source) * containing the data to be copied */ template -inline void copy(T(&destination)[N], span source) +inline void copy(T(&destination)[N], span source, optional_ref stream = {}) { #ifndef NDEBUG if (source.size() > N) { @@ -547,7 +639,7 @@ inline void copy(T(&destination)[N], span source) " elements into an array of " + ::std::to_string(N) + " elements"); } #endif - return copy(destination, source.start(), sizeof(T) * N); + return copy(destination, source.start(), sizeof(T) * N, stream); } /** @@ -560,9 +652,9 @@ inline void copy(T(&destination)[N], span source) * of the first element, there is no array-decay. */ template -inline void copy(void* destination, T (&source)[N]) +inline void copy(void* destination, T (&source)[N], optional_ref stream = {}) { - return copy(destination, source, sizeof(T) * N); + return copy(destination, source, sizeof(T) * N, stream); } /** @@ -572,60 +664,23 @@ inline void copy(void* destination, T (&source)[N]) * @p source,of size at least that of @p source.; as it is taken by reference * rather than by address of the first element, there is no array-decay. * @param source The starting address of a sequence of @tparam N elements to copy - */ -template -inline void copy(T(&destination)[N], T* source) -{ - return copy(destination, source, sizeof(T) * N); -} - -/** - * Copy one region of memory into another * - * @param destination A region of memory to which to copy the data in @p source, - * of size at least that of @p source. - * @param source A pointer to a a memory region of size @p num_bytes. 
- * @param num_bytes The number of bytes to copy from @p source to @p destination - */ -inline void copy(region_t destination, void* source, size_t num_bytes) -{ -#ifndef NDEBUG - if (destination.size() < num_bytes) { - throw ::std::logic_error("Number of bytes to copy exceeds destination size"); - } -#endif - return copy(destination.start(), source, num_bytes); -} - -/** - * Copy one region of memory to another location +** + * Asynchronously copies data from a memory region into a C-style array * - * @param destination The beginning of a target region of memory (of size at least - * @p num_bytes) into which to copy - * @param source A region of memory from which to copy, of size at least @p num_bytes - * @param num_bytes The number of bytes to copy from @p source to @p destination + * @param destination A fixed-size C-style array, to which to copy the data in + * @p source,of size at least that of @p source.; as it is taken by reference + * rather than by address of the first element, there is no array-decay. + * @param source The starting address of a sequence of @tparam N elements to copy + * @param stream schedule the copy operation in this CUDA stream */ -inline void copy(void* destination, const_region_t source, size_t num_bytes) +template +inline void copy(T(&destination)[N], T* source, optional_ref stream = {}) { -#ifndef NDEBUG - if (source.size() < num_bytes) { - throw ::std::logic_error("Number of bytes to copy exceeds source size"); - } -#endif - return copy(destination, source.start(), num_bytes); + return copy(destination, source, sizeof(T) * N, stream); } -/** - * Copy memory between memory regions - * - * @param destination A target region of memory into which to copy; enough memory will - * be copied to fill this region - * @param source The beginning of a region of memory from which to copy - */ -inline void copy(region_t destination, void* source) -{ - return copy(destination, source, destination.size()); -} + ///@} /** @@ -694,38 +749,45 @@ inline void zero(T* ptr) namespace detail_ { -inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<2> params) +inline status_t multidim_copy(::std::integral_constant two, copy_parameters_t<2> params, optional stream_handle) { // TODO: Move this logic into the scoped ensurer class auto context_handle = context::current::detail_::get_handle(); if (context_handle != context::detail_::none) { - return cuMemcpy2D(¶ms); + return detail_::multidim_copy_in_current_context(two, params, stream_handle); } auto current_device_id = cuda::device::current::detail_::get_id(); context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id); context::current::detail_::push(context_handle); // Note this _must_ be an intra-context copy, as inter-context is not supported // and there's no indication of context in the relevant data structures - auto status = cuMemcpy2D(¶ms); + auto status = detail_::multidim_copy_in_current_context(two, params, stream_handle); context::current::detail_::pop(); cuda::device::primary_context::detail_::decrease_refcount(current_device_id); return status; } -inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params) +inline status_t multidim_copy(context::handle_t context_handle, ::std::integral_constant, copy_parameters_t<2> params, optional stream_handle) +{ + context::current::detail_::scoped_override_t context_for_this_scope(context_handle); + return multidim_copy(::std::integral_constant{}, params, stream_handle); +} + +inline 
status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params, optional stream_handle) { if (params.srcContext == params.dstContext) { context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext}; - auto *intra_context_params = reinterpret_cast::intra_context_type *>(¶ms); - return cuMemcpy3D(intra_context_params); + return detail_::multidim_copy_in_current_context(params, stream_handle); } - return cuMemcpy3DPeer(¶ms); + return stream_handle ? + cuMemcpy3DPeerAsync(¶ms, *stream_handle) : + cuMemcpy3DPeer(¶ms); } template -status_t multidim_copy(copy_parameters_t params) +status_t multidim_copy(copy_parameters_t params, stream::handle_t stream_handle) { - return multidim_copy(::std::integral_constant{}, params); + return multidim_copy(::std::integral_constant{}, params, stream_handle); } @@ -742,11 +804,7 @@ status_t multidim_copy(copy_parameters_t params) * merely pass it on to the CUDA driver */ template -void copy(copy_parameters_t params) -{ - status_t status = detail_::multidim_copy(params); - throw_if_error_lazy(status, "Copying using a general copy parameters structure"); -} +void copy(copy_parameters_t params, optional_ref stream = {}); /** * Synchronously copies data from a CUDA array into non-array memory. @@ -762,7 +820,7 @@ void copy(copy_parameters_t params) * the target array context */ template -void copy(const array_t& destination, const context_t& source_context, const T *source) +void copy(const array_t& destination, const context_t& source_context, const T *source, optional_ref stream = {}) { auto dims = destination.dimensions(); auto params = copy_parameters_t {}; @@ -771,7 +829,7 @@ void copy(const array_t& destination, const context_t& source_ params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast(source), dims); params.set_endpoint(endpoint_t::destination, destination); params.clear_rest(); - copy(params); + copy(params, stream); } /** @@ -783,12 +841,17 @@ void copy(const array_t& destination, const context_t& source_ * @param destination A {@tparam NumDimensions}-dimensional CUDA array * @param source A pointer to a region of contiguous memory holding `destination.size()` values * of type @tparam T. The memory may be located either on a CUDA device or in host memory. + * + * Asynchronously copies data into a CUDA array. 
+ * + * @note asynchronous version of @ref memory::copy(array_t&, const T*) + * + * @param destination A CUDA array to copy data into + * @param source A pointer to a a memory region of size `destination.size() * sizeof(T)` + * @param stream schedule the copy operation into this CUDA stream */ -template -void copy(const array_t& destination, const T *source) -{ - copy(destination, context_of(source), source); -} +template +void copy(array_t& destination, const T* source, optional_ref stream = {}); /** * Copies a contiguous sequence of elements in memory into a CUDA array @@ -799,7 +862,7 @@ void copy(const array_t& destination, const T *source) * in the source span are ignored */ template -void copy(const array_t& destination, span source) +void copy(const array_t& destination, span source, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < source.size()) { @@ -808,7 +871,7 @@ void copy(const array_t& destination, span source) " elements into a CUDA array of " + ::std::to_string(destination.size()) + " elements"); } #endif - copy(destination, source.data()); + copy(destination, source.data(), stream); } /** @@ -822,7 +885,7 @@ void copy(const array_t& destination, span source) * @param source A {@tparam NumDimensions}-dimensional CUDA array */ template -void copy(const context_t& context, T *destination, const array_t& source) +void copy(const context_t& context, T *destination, const array_t& source, optional_ref stream = {}) { auto dims = source.dimensions(); auto params = copy_parameters_t {}; @@ -833,7 +896,7 @@ void copy(const context_t& context, T *destination, const array_t(endpoint_t::destination, context.handle(), destination, dims); params.set_default_pitches(); params.clear_rest(); - copy(params); + copy(params, stream); } /** @@ -845,12 +908,18 @@ void copy(const context_t& context, T *destination, const array_t -void copy(T *destination, const array_t& source) -{ - copy(context_of(destination), destination, source); -} +void copy(T* destination, const array_t& source, optional_ref stream = {}); + /** * Copies the contents of a CUDA array into a sequence of contiguous elements in memory @@ -860,7 +929,7 @@ void copy(T *destination, const array_t& source) * @note The @p destination span must be at least as larger as the volume of the array. */ template -void copy(span destination, const array_t& source) +void copy(span destination, const array_t& source, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < source.size()) { @@ -869,7 +938,7 @@ void copy(span destination, const array_t& source) " elements into a span of " + ::std::to_string(destination.size()) + " elements"); } #endif - copy(destination.data(), source); + copy(destination.data(), source, stream); } /** @@ -880,7 +949,7 @@ void copy(span destination, const array_t& source) * @note The destination array must be at least as large in each dimension as the source array. 
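+ *
+ * A minimal usage sketch (with hypothetical names, for illustration only:
+ * a 2D `array_t<float, 2>` named `arr`, a host-side pointer `h_data` to
+ * `arr.size()` floats, and a `device_t` named `device`):
+ *
+ *     auto stream = device.create_stream(cuda::stream::async);
+ *     cuda::memory::copy(arr, h_data, stream); // enqueued on `stream`
+ *     cuda::memory::copy(arr, h_data);         // synchronous copy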
*/ template -void copy(const array_t& destination, const array_t& source) +void copy(const array_t& destination, const array_t& source, optional_ref stream) { auto dims = source.dimensions(); auto params = copy_parameters_t {}; @@ -890,9 +959,9 @@ void copy(const array_t& destination, const array_t(source.context_handle(), params); + detail_::multidim_copy(source.context_handle(), params, stream); throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region"); } @@ -903,14 +972,27 @@ void copy(const array_t& destination, const array_t -void copy(region_t destination, const array_t& source) +void copy(region_t destination, const array_t& source, optional_ref stream = {}) { +#ifndef NDEBUG if (destination.size() < source.size_bytes()) { - throw ::std::logic_error("Attempt to copy an array into a memory region too small to hold the copy"); + throw ::std::invalid_argument( + "Attempt to copy " + ::std::to_string(source.size_bytes()) + " bytes from an array into a " + "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)"); } - copy(destination.start(), source); +#endif + copy(destination.start(), source, stream); } /** @@ -920,14 +1002,23 @@ void copy(region_t destination, const array_t& source) * * @note only as many elements as fit in the array are copied, while the source region may * be larger than what they take up. + * + * @param destination A CUDA array to copy data into + * @param source A memory region of size `destination.size() * sizeof(T)` + * @param stream schedule the copy operation into this CUDA stream (or leave empty for a + * synchronous copy) */ template -void copy(const array_t& destination, const_region_t source) +void copy(array_t& destination, const_region_t source, optional_ref stream = {}) { +#ifndef NDEBUG if (destination.size_bytes() < source.size()) { - throw ::std::logic_error("Attempt to copy into an array from a source region larger than the array's size"); + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into an array of size " + ::std::to_string(destination.size_bytes()) + " bytes"); } - copy(destination, static_cast(source.start())); +#endif + copy(destination, static_cast(source.start()), stream); } /** @@ -937,169 +1028,17 @@ void copy(const array_t& destination, const_region_t source) * device's global memory * @param source a value residing either in host memory or on any CUDA * device's global memory - */ -template -void copy_single(T* destination, const T* source) -{ - copy(destination, source, sizeof(T)); -} - -/// Asynchronous memory operations -namespace async { - -namespace detail_ { - -/** - * Asynchronous versions of @ref memory::copy functions. * + * Copy a single (typed) value between memory locations * - * @note Since we assume Compute Capability >= 2.0, all devices support the - * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, - * where the data is located, and one does not have to specify this. - */ - -///@{ - -/** - * Asynchronously copies data between memory spaces or within a memory space, but - * within a single CUDA context. 
- * - * @param destination A pointer to a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param source A pointer to a memory region of size at least @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param num_bytes number of bytes to copy from @p source - * @param stream_handle The handle of a stream on which to schedule the copy operation -*/ -inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle) -{ - auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle); - - // TODO: Determine whether it was from host to device, device to host etc and - // add this information to the error string - throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle)); -} - -/** - * @param destination a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param source a memory region of size @p num_bytes, either in - * host memory or on any CUDA device's global memory - * @param stream_handle The handle of a stream on which to schedule the copy operation - */ -inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle) -{ -#ifndef NDEBUG - if (destination.size() < source.size()) { - throw ::std::logic_error("Source size exceeds destination size"); - } -#endif - copy(destination.start(), source.start(), source.size(), stream_handle); -} -///@} - -using memory::copy_parameters_t; - -inline status_t multidim_copy_in_current_context( - ::std::integral_constant, - copy_parameters_t<2> params, - stream::handle_t stream_handle) -{ - // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters - // structure holds no information about contexts. - return cuMemcpy2DAsync(¶ms, stream_handle); -} - -inline status_t multidim_copy_in_current_context( - ::std::integral_constant, - copy_parameters_t<3> params, - stream::handle_t stream_handle) -{ - if (params.srcContext == params.dstContext) { - using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type; - auto* intra_context_params = reinterpret_cast(¶ms); - return cuMemcpy3DAsync(intra_context_params, stream_handle); - } - return cuMemcpy3DPeerAsync(¶ms, stream_handle); - -} - -template -status_t multidim_copy_in_current_context(copy_parameters_t params, stream::handle_t stream_handle) { - return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); -} - -// Note: Assumes the stream handle is for a stream in the current context -template -status_t multidim_copy( - context::handle_t context_handle, - copy_parameters_t params, - stream::handle_t stream_handle) -{ - CAW_SET_SCOPE_CONTEXT(context_handle); - return multidim_copy_in_current_context(::std::integral_constant{}, params, stream_handle); -} - -// Assumes the array and the stream share the same context, and that the destination is -// accessible from that context (e.g. allocated within it, or being managed memory, etc.) 
-template -void copy(T *destination, const array_t& source, stream::handle_t stream_handle) -{ - using memory::endpoint_t; - auto dims = source.dimensions(); - //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - auto params = copy_parameters_t {}; - params.clear_offset(endpoint_t::source); - params.clear_offset(endpoint_t::destination); - params.template set_extent(dims); - params.set_endpoint(endpoint_t::source, source); - params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); - params.set_default_pitches(); - params.clear_rest(); - auto status = multidim_copy_in_current_context(params, stream_handle); - throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); -} - - -template -void copy(const array_t& destination, const T* source, stream::handle_t stream_handle) -{ - using memory::endpoint_t; - auto dims = destination.dimensions(); - //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - auto params = copy_parameters_t{}; - params.clear_offset(endpoint_t::source); - params.clear_offset(endpoint_t::destination); - params.template set_extent(dims); - params.set_endpoint(endpoint_t::source, const_cast(source), dims); - params.set_endpoint(endpoint_t::destination, destination); - params.set_default_pitches(); - params.clear_rest(); - auto status = multidim_copy_in_current_context(params, stream_handle); - throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); -} - -/** - * Synchronously copies a single (typed) value between memory spaces or within a memory space. - * - * @note asynchronous version of @ref memory::copy_single - * - * @note assumes the source and destination are all valid in the same context as that of the - * context handle + * @note asynchronous version of @ref memory::copy_single(T&, const T&) * - * @param destination a value residing either in host memory or on any CUDA device's - * global memory - * @param source a value residing either in host memory or on any CUDA device's global - * memory - * @param stream_handle A stream on which to enqueue the copy operation + * @param destination a value residing either in host memory or on any CUDA device's global memory + * @param source a value residing either in host memory or on any CUDA device's global memory + * @param stream The CUDA command queue on which this copying will be enqueued */ template -void copy_single(T* destination, const T* source, stream::handle_t stream_handle) -{ - copy(destination, source, sizeof(T), stream_handle); -} - -} // namespace detail_ +void copy_single(T* destination, const T* source, optional_ref stream = {}); /** * Asynchronously copies data between memory spaces or within a memory space. 
@@ -1119,29 +1058,77 @@ void copy_single(T* destination, const T* source, stream::handle_t stream_handle * @param num_bytes The number of bytes to copy from @p source to @p destination * @param stream A stream on which to enqueue the copy operation */ -void copy(void* destination, void const* source, size_t num_bytes, const stream_t& stream); +void copy(void* destination, void const* source, size_t num_bytes, optional_ref stream = {}); + /** - * Asynchronously copies data between memory regions + * Copy the contents of memory region into a C-style array, interpreting the memory + * as a sequence of elements of the array's element type * - * @param destination The beginning of a memory region of size @p num_bytes, either in host - * memory or on any CUDA device's global memory. Must be registered with, or visible in, - * in the same context as @p stream. - * @param source A memory region of size @p num_bytes, either in host memory or on any - * CUDA device's global memory. Must be defined in the same context as the stream. - * @param num_bytes The number of bytes to copy from @p source to @p destination + * @param destination A region of memory to which to copy the data in @p source, + * of size at least that of @p source. + * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill + * the @p destination array. + * +** + * Asynchronously copies data from a memory region into a C-style array + * + * @param destination A fixed-size C-style array, to which to copy the data in + * @p source,of size at least that of @p source.; as it is taken by reference + * rather than by address of the first element, there is no array-decay. + * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill + * the @p destination array. + * @param stream schedule the copy operation in this CUDA stream + */ +template +inline void copy(T(&destination)[N], const_region_t source, optional_ref stream = {}) +{ +#ifndef NDEBUG + size_t required_size = N * sizeof(T); + if (source.size() != required_size) { + throw ::std::invalid_argument( + "Attempt to copy a region of " + ::std::to_string(source.size()) + + " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); + } +#endif + return copy(&(destination[0]), source.start(), sizeof(T) * N, stream); +} + +/** + * @note Since we assume Compute Capability >= 2.0, all devices support the + * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer, + * used in a copy function, where the data is located, and one does not have to specify this. + * + * @note the sources and destinations may all be in any memory space addressable + * in the the unified virtual address space, which could be host-side memory, + * device global memory, device constant memory etc. + * + * +** + * @param destination A region of memory to which to copy the data in @p source, + * of size at least that of @p source. + * @param source A plain array whose contents is to be copied. + * + * ** + * Asynchronously copies data from an array into a memory region + * + * @param destination A region of memory, either in host memory or on any CUDA device's + * global memory. Must be defined in the same context as the stream. + * @param source An array, either in host memory or on any CUDA device's global memory. 
* @param stream A stream on which to enqueue the copy operation */ -inline void copy(void* destination, const_region_t source, size_t num_bytes, const stream_t& stream) +template +inline void copy(region_t destination, const T(&source)[N], optional_ref stream = {}) { #ifndef NDEBUG - if (source.size() < num_bytes) { - throw ::std::logic_error("Attempt to copy more than the source region's size"); + if (destination.size() < N) { + throw ::std::logic_error("Source size exceeds destination size"); } #endif - copy(destination, source.start(), num_bytes, stream); + return copy(destination.start(), source, sizeof(T) * N, stream); } + /** * Asynchronously copies data between memory spaces or within a memory space. * @@ -1153,7 +1140,7 @@ inline void copy(void* destination, const_region_t source, size_t num_bytes, con * @param num_bytes The number of bytes to copy from @p source to @p destination * @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, const_region_t source, size_t num_bytes, const stream_t& stream) +inline void copy(region_t destination, const_region_t source, size_t num_bytes, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < num_bytes) { @@ -1163,22 +1150,15 @@ inline void copy(region_t destination, const_region_t source, size_t num_bytes, copy(destination.start(), source.start(), num_bytes, stream); } -/** - * Asynchronously copies data between memory regions - * - * @param destination Beginning of a memory region into which to copy data, either in host - * memory or on any CUDA device's global memory. The memory must be registered in, - * or visible within, the same context as {@p stream}. - * @param source A memory region of size @p num_bytes, either in host memory or on any CUDA - * device's global memory. Must be defined in the same context as the stream. - * @param stream A stream on which to enqueue the copy operation - */ -inline void copy(void* destination, const_region_t source, const stream_t& stream) -{ - copy(destination, source, source.size(), stream); -} /** + * @param destination A region of memory to which to copy the data in @p source, of + * size at least that of @p source , either in host memory or on any CUDA + * device's global memory. + * @param source A region whose contents is to be copied, either in host memory + * or on any CUDA device's global memory + * +** * Asynchronously copies data between memory regions * * @param destination A region of memory, either in host memory or on any CUDA device's @@ -1187,12 +1167,20 @@ inline void copy(void* destination, const_region_t source, const stream_t& strea * global memory. Must be defined in the same context as the stream. * @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, const_region_t source, const stream_t& stream) +inline void copy(region_t destination, const_region_t source, optional_ref stream = {}) { copy(destination, source, source.size(), stream); } + /** + * Copy memory between memory regions + * + * @param destination A target region of memory into which to copy; enough memory will + * be copied to fill this region + * @param source The beginning of a region of memory from which to copy + * +** * Asynchronously copies data between memory regions * * @param destination A region of memory, either in host memory or on any CUDA device's @@ -1202,31 +1190,20 @@ inline void copy(region_t destination, const_region_t source, const stream_t& st * in the same context as the stream. 
* @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, void* source, const stream_t& stream) +inline void copy(region_t destination, void* source, optional_ref stream = {}) { return copy(destination.start(), source, destination.size(), stream); } /** - * Asynchronously copies data from an array into a memory region + * Copy one region of memory into another * - * @param destination A region of memory, either in host memory or on any CUDA device's - * global memory. Must be defined in the same context as the stream. - * @param source An array, either in host memory or on any CUDA device's global memory. - * @param stream A stream on which to enqueue the copy operation - */ -template -inline void copy(region_t destination, const T(&source)[N], const stream_t& stream) -{ -#ifndef NDEBUG - if (destination.size() < N) { - throw ::std::logic_error("Source size exceeds destination size"); - } -#endif - return copy(destination.start(), source, sizeof(T) * N, stream); -} - -/** + * @param destination A region of memory to which to copy the data in @p source, + * of size at least that of @p source. + * @param source A pointer to a a memory region of size @p num_bytes. + * @param num_bytes The number of bytes to copy from @p source to @p destination + * +** * Asynchronously copies data from one region of memory to another * * @param destination A region of memory, either in host memory or on any CUDA device's @@ -1235,7 +1212,7 @@ inline void copy(region_t destination, const T(&source)[N], const stream_t& stre * @param num_bytes Amount of memory to copy * @param stream A stream on which to enqueue the copy operation */ -inline void copy(region_t destination, void* source, size_t num_bytes, const stream_t& stream) +inline void copy(region_t destination, void* source, size_t num_bytes, optional_ref stream = {}) { #ifndef NDEBUG if (destination.size() < num_bytes) { @@ -1246,127 +1223,53 @@ inline void copy(region_t destination, void* source, size_t num_bytes, const str } /** - * Asynchronously copies data into a CUDA array. - * - * @note asynchronous version of @ref memory::copy(array_t&, const T*) - * - * @param destination A CUDA array to copy data into - * @param source A pointer to a a memory region of size `destination.size() * sizeof(T)` - * @param stream schedule the copy operation into this CUDA stream - */ -template -void copy(array_t& destination, const T* source, const stream_t& stream); - -/** - * Asynchronously copies data into a CUDA array. 
- * - * @note asynchronous version of @ref memory::copy(array_t&, const T*) - * - * @param destination A CUDA array to copy data into - * @param source A memory region of size `destination.size() * sizeof(T)` - * @param stream schedule the copy operation into this CUDA stream - */ -template -void copy(array_t& destination, const_region_t source, const stream_t& stream) -{ -#ifndef NDEBUG - size_t required_size = destination.size() * sizeof(T); - if (source.size() != required_size) { - throw ::std::invalid_argument( - "Attempt to copy a region of " + ::std::to_string(source.size()) + - " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); - } -#endif - copy(destination, static_cast(source.start()), stream); -} - -/** - * Asynchronously copies data from a CUDA array elsewhere - * - * @note asynchronous version of @ref memory::copy + * Copy one region of memory to another location * - * @param destination A pointer to a a memory region of size `source.size() * sizeof(T)` - * @param source A CUDA array @ref cuda::array_t - * @param stream schedule the copy operation into this CUDA stream - */ -template -void copy(T* destination, const array_t& source, const stream_t& stream); - -/** - * Asynchronously copies data from a CUDA array elsewhere + * @param destination The beginning of a target region of memory (of size at least + * @p num_bytes) into which to copy + * @param source A region of memory from which to copy, of size at least @p num_bytes + * @param num_bytes The number of bytes to copy from @p source to @p destination * - * @note asynchronous version of @ref memory::copy +** + * Asynchronously copies data between memory regions * - * @param destination A memory region of size `source.size() * sizeof(T)` - * @param source A CUDA array @ref cuda::array_t - * @param stream schedule the copy operation in this CUDA stream + * @param destination The beginning of a memory region of size @p num_bytes, either in host + * memory or on any CUDA device's global memory. Must be registered with, or visible in, + * in the same context as @p stream. + * @param source A memory region of size @p num_bytes, either in host memory or on any + * CUDA device's global memory. Must be defined in the same context as the stream. + * @param num_bytes The number of bytes to copy from @p source to @p destination + * @param stream A stream on which to enqueue the copy operation */ -template -void copy(region_t destination, const array_t& source, const stream_t& stream) +inline void copy(void* destination, const_region_t source, size_t num_bytes, optional_ref stream = {}) { #ifndef NDEBUG - size_t required_size = source.size() * sizeof(T); - if (destination.size() < required_size) { - throw ::std::invalid_argument( - "Attempt to copy " + ::std::to_string(required_size) + " bytes from an array into a " - "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)"); + if (source.size() < num_bytes) { + throw ::std::logic_error("Attempt to copy more than the source region's size"); } #endif - copy(destination.start(), source, stream); + copy(destination, source.start(), num_bytes, stream); } /** - * Asynchronously copies data from a memory region into a C-style array + * @param destination A memory region of the same size as @p source. + * @param source A region whose contents is to be copied. 
* - * @param destination A fixed-size C-style array, to which to copy the data in - * @p source,of size at least that of @p source.; as it is taken by reference - * rather than by address of the first element, there is no array-decay. - * @param source The starting address of a sequence of @tparam N elements to copy - * @param stream schedule the copy operation in this CUDA stream - */ -template -inline void copy(T(&destination)[N], T* source, const stream_t& stream) -{ - return copy(destination, source, sizeof(T) * N, stream); -} - -/** - * Asynchronously copies data from a memory region into a C-style array +** + * Asynchronously copies data between memory regions * - * @param destination A fixed-size C-style array, to which to copy the data in - * @p source,of size at least that of @p source.; as it is taken by reference - * rather than by address of the first element, there is no array-decay. - * @param source A region of at least `sizeof(T)*N` bytes with whose data to fill - * the @p destination array. - * @param stream schedule the copy operation in this CUDA stream + * @param destination Beginning of a memory region into which to copy data, either in host + * memory or on any CUDA device's global memory. The memory must be registered in, + * or visible within, the same context as {@p stream}. + * @param source A memory region of size @p num_bytes, either in host memory or on any CUDA + * device's global memory. Must be defined in the same context as the stream. + * @param stream A stream on which to enqueue the copy operation */ -template -inline void copy(T(&destination)[N], const_region_t source, const stream_t& stream) +inline void copy(void* destination, const_region_t source, optional_ref stream = {}) { -#ifndef NDEBUG - size_t required_size = N * sizeof(T); - if (source.size() != required_size) { - throw ::std::invalid_argument( - "Attempt to copy a region of " + ::std::to_string(source.size()) + - " bytes into an array of size " + ::std::to_string(required_size) + " bytes"); - } -#endif - return copy(destination, source.start(), sizeof(T) * N, stream); + copy(destination, source, source.size(), stream); } -/** - * Copy a single (typed) value between memory locations - * - * @note asynchronous version of @ref memory::copy_single(T&, const T&) - * - * @param destination a value residing either in host memory or on any CUDA device's global memory - * @param source a value residing either in host memory or on any CUDA device's global memory - * @param stream The CUDA command queue on which this copying will be enqueued - */ -template -void copy_single(T* destination, const T* source, const stream_t& stream); - -} // namespace async namespace device { @@ -1430,7 +1333,7 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han * @param stream The stream on which to enqueue the operation. 
*/ template -void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream); +void typed_set(T* start, const T& value, size_t num_elements, optional_ref stream); /** * Asynchronously sets all bytes in a stretch of memory to a single value @@ -1443,7 +1346,7 @@ void typed_set(T* start, const T& value, size_t num_elements, const stream_t& st * @param num_bytes size of the memory region in bytes * @param stream stream on which to schedule this action */ -inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream) +inline void set(void* start, int byte_value, size_t num_bytes, optional_ref stream) { return typed_set( static_cast(start), @@ -1462,7 +1365,7 @@ inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& s * @param num_bytes size of the memory region in bytes * @param stream stream on which to schedule this action */ -void zero(void* start, size_t num_bytes, const stream_t& stream); +void zero(void* start, size_t num_bytes, optional_ref stream); /** * Asynchronously sets all bytes of a single pointed-to value @@ -1475,7 +1378,7 @@ void zero(void* start, size_t num_bytes, const stream_t& stream); * @param stream stream on which to schedule this action */ template -inline void zero(T* ptr, const stream_t& stream) +inline void zero(T* ptr, optional_ref stream) { zero(ptr, sizeof(T), stream); } @@ -1486,51 +1389,21 @@ inline void zero(T* ptr, const stream_t& stream) namespace inter_context { -namespace detail_ { - -inline void copy( - void * destination_address, - context::handle_t destination_context, - const void * source_address, - context::handle_t source_context, - size_t num_bytes) -{ - auto status = cuMemcpyPeer( - reinterpret_cast(destination_address), - destination_context, - reinterpret_cast(source_address), - source_context, num_bytes); - throw_if_error_lazy(status, - ::std::string("Failed copying data between devices: From address ") - + cuda::detail_::ptr_as_hex(source_address) + " in " - + context::detail_::identify(source_context) + " to address " - + cuda::detail_::ptr_as_hex(destination_address) + " in " - + context::detail_::identify(destination_context) ); -} - -} // namespace detail_ - void copy( - void * destination, - const context_t& destination_context, - const void * source_address, - const context_t& source_context, - size_t num_bytes); - -inline void copy( - void * destination, - const context_t& destination_context, - const_region_t source, - const context_t& source_context) -{ - copy(destination, destination_context, source.start(), source_context, source.size()); -} - + void * destination, + const context_t& destination_context, + const void * source_address, + const context_t& source_context, + size_t num_bytes, + optional_ref stream); + +/* inline void copy( region_t destination, const context_t& destination_context, const_region_t source, - const context_t& source_context) + const context_t& source_context, + optional_ref stream) { #ifndef NDEBUG if (destination.size() < destination.size()) { @@ -1539,108 +1412,111 @@ inline void copy( " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes"); } #endif - copy(destination.start(), destination_context, source, source_context); + copy(destination.start(), destination_context, source, source_context, stream); } +*/ + + +/* template inline void copy( array_t destination, - array_t source) + array_t source, + optional_ref stream) { // for arrays, a single mechanism handles both intra- and inter-context copying - 
return memory::copy(destination, source); + return memory::copy(destination, source, stream); } - -namespace async { +*/ namespace detail_ { -inline void copy( - void *destination, - context::handle_t destination_context_handle, - const void *source, - context::handle_t source_context_handle, - size_t num_bytes, - stream::handle_t stream_handle) -{ - auto result = cuMemcpyPeerAsync( - device::address(destination), - destination_context_handle, - device::address(source), - source_context_handle, - num_bytes, stream_handle); - - // TODO: Determine whether it was from host to device, device to host etc and - // add this information to the error string - throw_if_error_lazy(result, "Scheduling an inter-context memory copy from " - + context::detail_::identify(source_context_handle) + " to " - + context::detail_::identify(destination_context_handle) + " on " - + stream::detail_::identify(stream_handle)); -} - /** * @param destination a memory region of size @p num_bytes, either in * host memory or on any CUDA device's global memory * @param source a memory region of size @p num_bytes, either in * host memory or on any CUDA device's global memory * @param stream_handle The handle of a stream on which to schedule the copy operation - */ + * inline void copy( region_t destination, context::handle_t destination_context_handle, const_region_t source, context::handle_t source_context_handle, - stream::handle_t stream_handle) + optional stream_handle) { #ifndef NDEBUG if (destination.size() < source.size()) { throw ::std::logic_error("Can't copy a large region into a smaller one"); } #endif - copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(), - stream_handle); + copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(), stream_handle); } + */ } // namespace detail_ /// Asynchronously copy a region of memory defined in one context into a region defined in another void copy( - void * destination_address, - context_t destination_context, - const void * source_address, - context_t source_context, - size_t num_bytes, - const stream_t& stream); + void * destination_address, + const context_t& destination_context, + const void * source_address, + const context_t& source_context, + size_t num_bytes, + optional_ref stream); /// Asynchronously copy a region of memory defined in one context into a region defined in another -void copy( - void * destination, - context_t destination_context, - const_region_t source, - context_t source_context, - const stream_t& stream); +inline void copy( + void * destination, + const context_t& destination_context, + const_region_t source, + const context_t& source_context, + optional_ref stream) +{ + copy(destination, destination_context, source.start(), source_context, source.size(), stream); +} /// Asynchronously copy a region of memory defined in one context into a region defined in another inline void copy( - region_t destination, - context_t destination_context, - const_region_t source, - context_t source_context, - const stream_t& stream); + region_t destination, + const context_t& destination_context, + const void* source, + const context_t& source_context, + optional_ref stream) +{ + copy(destination.start(), destination_context, source, source_context, destination.size(), stream); +} + +/// Asynchronously copy a region of memory defined in one context into a region defined in another +inline void copy( + region_t destination, + const context_t& destination_context, + 
const_region_t source,
+	const context_t& source_context,
+	optional_ref stream)
+{
+#ifndef NDEBUG
+	if (destination.size() < source.size()) {
+		throw ::std::invalid_argument(
+			"Attempt to copy a region of " + ::std::to_string(source.size()) +
+			" bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
+	}
+#endif
+	copy(destination.start(), destination_context, source, source_context, stream);
+}
 
 /// Asynchronously copy a CUDA array defined in one context into a CUDA array defined in another
 template 
 inline void copy(
-	array_t destination,
-	array_t source,
-	const stream_t& stream)
+	array_t destination,
+	array_t source,
+	optional_ref stream)
 {
 	// for arrays, a single mechanism handles both intra- and inter-context copying
-	return memory::async::copy(destination, source, stream);
+	return memory::copy(destination, source, stream);
 }
 
-} // namespace async
-
 } // namespace inter_context
 
 /// Host-side (= system) memory which is "pinned", i.e. resides in
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index f9b80c65..d8ab0514 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -13,6 +13,7 @@
 
 #include 
 
+#include "../memory.hpp"
 #include "../array.hpp"
 #include "../device.hpp"
 #include "../event.hpp"
@@ -28,23 +29,13 @@ namespace cuda {
 
 namespace memory {
 
-namespace async {
-
-inline void copy(void *destination, const void *source, size_t num_bytes, const stream_t& stream)
-{
-	detail_::copy(destination, source, num_bytes, stream.handle());
-}
-
-// Note: Assumes the source pointer is valid in the stream's context
-template 
-inline void copy(array_t& destination, const T* source, const stream_t& stream)
-{
-	detail_::copy(destination, source, stream.handle());
-}
-
 template 
-inline void copy(array_t& destination, span source, const stream_t& stream)
+inline void copy(array_t& destination, span source, optional_ref stream)
 {
+	if (not stream) {
+		memory::copy(destination, source);
+		return;
+	}
 #ifndef NDEBUG
 	if (source.size() != destination.size()) {
 		throw ::std::invalid_argument(
@@ -52,41 +43,63 @@ inline void copy(array_t& destination, span source, c
 		" elements into an array of " + ::std::to_string(destination.size()) + " elements");
 	}
 #endif
-	detail_::copy(destination, source.data(), stream.handle());
+	detail_::copy(destination, source.data(), stream->handle());
 }
 
 // Note: Assumes the destination, source and stream are all usable on the same content
 template 
-inline void copy(T* destination, const array_t& source, const stream_t& stream)
+inline void copy(T* destination, const array_t& source, optional_ref stream)
 {
-	if (stream.context_handle() != source.context_handle()) {
+	if (not stream) {
+		memory::copy(context_of(destination), destination, source);
+		return;
+	}
+	if (stream->context_handle() != source.context_handle()) {
 		throw ::std::invalid_argument("Attempt to copy an array in"
-		+ context::detail_::identify(source.context_handle()) + " via "
-		+ stream::detail_::identify(stream));
+			+ context::detail_::identify(source.context_handle()) + " via "
+			+ stream::detail_::identify(*stream));
 	}
-	detail_::copy(destination, source, stream.handle());
+	detail_::copy(destination, source, stream->handle());
 }
 
-template 
-inline void copy(span destination, const array_t& source, const stream_t& stream)
+template 
+void copy(copy_parameters_t params, optional_ref stream)
 {
-#ifndef NDEBUG
-	if (destination.size() != source.size()) {
-		throw
::std::invalid_argument( - "Attempt to copy " + ::std::to_string(source.size()) + - " elements into an array of " + ::std::to_string(destination.size()) + " elements"); - } -#endif - copy(destination.data(), source, stream); + stream::handle_t stream_handle = stream ? stream->handle() : nullptr; + status_t status = detail_::multidim_copy(params, stream_handle); + throw_if_error_lazy(status, "Copying using a general copy parameters structure"); } + template -inline void copy_single(T* destination, const T* source, const stream_t& stream) +void copy_single(T* destination, const T* source, optional_ref stream) { - detail_::copy_single(destination, source, sizeof(T), stream.handle()); + memory::copy(destination, source, sizeof(T), stream); } -} // namespace async +// Note: Assumes the source pointer is valid in the stream's context +template +inline void copy(array_t& destination, const T* source, optional_ref stream) +{ + if (not stream) { + memory::copy(destination, context_of(source), source); + return; + } + detail_::copy(destination, source, stream->handle()); +} + +inline void copy(void *destination, const void *source, size_t num_bytes, optional_ref stream) +{ + if (not stream) { + context::current::detail_::scoped_existence_ensurer_t ensure_some_context{}; + auto result = cuMemcpy(device::address(destination), device::address(source), num_bytes); + // TODO: Determine whether it was from host to device, device to host etc and + // add this information to the error string + throw_if_error_lazy(result, "Synchronously copying data"); + return; + } + detail_::copy(destination, source, num_bytes, stream->handle()); +} namespace device { @@ -95,7 +108,6 @@ inline region_t allocate(const context_t& context, size_t size_in_bytes) return detail_::allocate(context.handle(), size_in_bytes); } - inline region_t allocate(const device_t& device, size_t size_in_bytes) { auto pc = device.primary_context(); @@ -133,62 +145,39 @@ inline void zero(void* start, size_t num_bytes, const stream_t& stream) namespace inter_context { inline void copy( - void * destination_address, - context_t destination_context, - const void * source_address, - context_t source_context, - size_t num_bytes) -{ - return detail_::copy( - destination_address, destination_context.handle(), - source_address, source_context.handle(), num_bytes); -} - -namespace async { - -inline void copy( - void * destination_address, - context_t destination_context, - const void * source_address, - context_t source_context, + void * destination, + const context_t& destination_context, + const void * source, + const context_t& source_context, size_t num_bytes, - const stream_t& stream) -{ - return detail_::copy( - destination_address, destination_context.handle(), source_address, - source_context.handle(), num_bytes, stream.handle()); -} + optional_ref stream = {}) +{ + auto status = stream ? 
+		cuMemcpyPeerAsync(
+			device::address(destination),
+			destination_context.handle(),
+			device::address(source),
+			source_context.handle(),
+			num_bytes,
+			stream->handle()) :
+		cuMemcpyPeer(
+			device::address(destination),
+			destination_context.handle(),
+			device::address(source),
+			source_context.handle(),
+			num_bytes);
-inline void copy(
-	region_t destination,
-	context_t destination_context,
-	const_region_t source,
-	context_t source_context,
-	const stream_t& stream)
-{
-#ifndef NDEBUG
-	if (destination.size() < destination.size()) {
-		throw ::std::invalid_argument(
-			"Attempt to copy a region of " + ::std::to_string(source.size()) +
-			" bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
-	}
-#endif
-	copy(destination.start(), destination_context, source, source_context, stream);
-}
-
-
-inline void copy(
-	void * destination,
-	context_t destination_context,
-	const_region_t source,
-	context_t source_context,
-	const stream_t& stream)
-{
-	copy(destination, destination_context, source.start(), source_context, source.size(), stream);
+	// TODO: Determine whether it was from host to device, device to host etc and
+	// add this information to the error string
+	throw_if_error_lazy(status,
+		::std::string("Failed copying data between devices: From address ")
+		+ cuda::detail_::ptr_as_hex(source) + " in "
+		+ context::detail_::identify(source_context.handle()) + " to address "
+		+ cuda::detail_::ptr_as_hex(destination) + " in "
+		+ context::detail_::identify(destination_context.handle())
+		+ (stream ? " on " + stream::detail_::identify(*stream) : ""));
 }
-} // namespace async
-
 } // namespace inter_context
 
 namespace managed {
@@ -258,9 +247,7 @@ inline void prefetch(
 	detail_::prefetch(region, destination.id(), stream.handle());
 }
 
-inline void prefetch_to_host(
-	const_region_t region,
-	const stream_t& stream)
+inline void prefetch_to_host(const_region_t region, const stream_t& stream)
 {
 	detail_::prefetch(region, CU_DEVICE_CPU, stream.handle());
 }
@@ -406,15 +393,6 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib
 } // namespace detail_
 } // namespace pointer
 
-inline void copy(void *destination, const void *source, size_t num_bytes)
-{
-	context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
-	auto result = cuMemcpy(device::address(destination), device::address(source), num_bytes);
-	// TODO: Determine whether it was from host to device, device to host etc and
-	// add this information to the error string
-	throw_if_error_lazy(result, "Synchronously copying data");
-}
-
 namespace device {
 
 template 
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index ce464aa6..cf6547e2 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -431,7 +431,7 @@ class stream_t {
 	// CUDA doesn't seem to need us to be in the stream's context to enqueue the copy;
 	// however, unfortunately, it does require us to be in _some_ context.
context::current::detail_::scoped_ensurer_t ensure_we_have_a_current_scope{associated_stream.context_handle_}; - memory::async::detail_::copy(destination, source, num_bytes, associated_stream.handle_); + memory::detail_::copy(destination, source, num_bytes, associated_stream.handle_); } /// @copybrief copy(void *, const void *, size_t) const diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp index 6ec1ddb3..cc5889b0 100644 --- a/src/cuda/api/types.hpp +++ b/src/cuda/api/types.hpp @@ -29,6 +29,7 @@ #endif #include "detail/optional.hpp" +#include "detail/optional_ref.hpp" #include "detail/span.hpp" #include "detail/region.hpp" #include "detail/type_traits.hpp"
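
Reviewer's note (not part of the patch): below is a minimal usage sketch of the unified copy API, showing the same cuda::memory::copy() call used both asynchronously (passing a stream as the trailing argument) and synchronously (omitting it). The include path <cuda/api.hpp>, cuda::device::get(), the explicit element type in make_unique_span<float>(), and the assumption that the stream parameter is defaulted to "no stream" are assumptions made for illustration, not verified against any particular version of the headers.

// usage_sketch.cpp -- illustrative only; assumes the cuda-api-wrappers headers.
#include <cuda/api.hpp>

#include <vector>
#include <cstddef>

int main()
{
	constexpr ::std::size_t n = 1024;
	auto device = cuda::device::get(0);                       // assumed device-getter
	auto stream = device.create_stream(cuda::stream::async);

	::std::vector<float> h_in(n, 1.0f), h_out(n);
	auto d_buf = cuda::memory::make_unique_span<float>(device, n);

	// Asynchronous use: the copy is enqueued on the given stream
	cuda::memory::copy(d_buf, h_in.data(), n * sizeof(float), stream);
	cuda::memory::copy(h_out.data(), d_buf, n * sizeof(float), stream);
	stream.synchronize();

	// Synchronous use: omit the stream argument (assumed to default to "no stream")
	cuda::memory::copy(d_buf, h_in.data(), n * sizeof(float));
	cuda::memory::copy(h_out.data(), d_buf, n * sizeof(float));
}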