Unified async and non-async copy functions:
* All copy functions now take an optional stream via an `optional_ref` parameter;
* No longer using the `cuda::memory::async` subnamespace for any copy functions; they are all directly in `cuda::memory`
* Fixes #688: async copies can now be specified via copy-parameters structures
eyalroz committed Nov 16, 2024
1 parent c0ecf2e commit 5881971
Showing 9 changed files with 528 additions and 674 deletions.
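
To illustrate the unified API (a minimal sketch, not code from this commit; `destination`, `source` and `stream` are placeholder names): the same `cuda::memory::copy` overload set now serves both cases, and the copy is enqueued asynchronously exactly when a stream is passed:

    cuda::memory::copy(destination, source);          // no stream: synchronous copy
    cuda::memory::copy(destination, source, stream);  // stream passed: asynchronous, enqueued on `stream`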
@@ -124,8 +124,8 @@ int main(int argc, const char **argv)
 
 stream.enqueue.kernel_launch(kernel, launch_config, d_inputArr.data(), d_numOfOdds.data(), d_sumOfOddEvenElems.data(), arrSize);
 
-cuda::memory::async::copy(h_numOfOdds, d_numOfOdds, stream);
-cuda::memory::async::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);
+cuda::memory::copy(h_numOfOdds, d_numOfOdds, stream);
+cuda::memory::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);
 
 stream.synchronize();
@@ -154,7 +154,7 @@ void enqueue_p2p_copy(
 // Since we assume Compute Capability >= 2.0, all devices support the
 // Unified Virtual Address Space, so we don't need to use
 // cudaMemcpyPeerAsync - cudaMemcpyAsync is enough.
-cuda::memory::async::copy(dest, src, stream);
+cuda::memory::copy(dest, src, stream);
 }
 }
 }
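
The comment in this hunk carries the key reasoning: under the Unified Virtual Address Space, the runtime can deduce the source and destination devices from the pointer values themselves. A raw-runtime sketch of the same idea (illustrative names, not code from this repository):

    // With UVA, cudaMemcpyDefault lets the runtime infer the copy direction and
    // the devices involved from the pointers, so cudaMemcpyPeerAsync is unnecessary.
    cudaMemcpyAsync(dest_ptr, src_ptr, num_bytes, cudaMemcpyDefault, raw_stream);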
@@ -152,8 +152,8 @@ int main(int argc, char** argv)
 auto d_C = cuda::memory::make_unique_span<float>(device, N);
 
 
-cuda::memory::async::copy(d_A, h_A.get(), size, stream);
-cuda::memory::async::copy(d_B, h_B.get(), size, stream);
+cuda::memory::copy(d_A, h_A.get(), size, stream);
+cuda::memory::copy(d_B, h_B.get(), size, stream);
 
 auto launch_config = cuda::launch_config_builder()
     .overall_size(N)

@@ -164,7 +164,7 @@ int main(int argc, char** argv)
 
 stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N);
 
-cuda::memory::async::copy(h_C.get(), d_C, size, stream);
+cuda::memory::copy(h_C.get(), d_C, size, stream);
 stream.synchronize();
 
 for (int i = 0; i < N; ++i) {
examples/modified_cuda_samples/simpleStreams/simpleStreams.cu (4 changes: 2 additions & 2 deletions)

@@ -143,7 +143,7 @@ void run_simple_streams_example(
 
 // time memcpy from device
 start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed
-cuda::memory::async::copy(h_a.get(), d_a, streams[0]);
+cuda::memory::copy(h_a.get(), d_a, streams[0]);
 stop_event.record();
 stop_event.synchronize(); // block until the event is actually recorded
 auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event);

@@ -207,7 +207,7 @@ void run_simple_streams_example(
 // commence executing when all previous CUDA calls in stream x have completed
 for (int i = 0; i < nstreams; i++)
 {
-cuda::memory::async::copy(
+cuda::memory::copy(
     h_a.data() + i * params.n / nstreams,
     d_a.data() + i * params.n / nstreams, nbytes / nstreams,
     streams[i]);
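
A caveat worth keeping in mind when reading this file's diff: passing a stream makes a copy truly asynchronous only when the host buffer is page-locked; with pageable host memory, the underlying `cudaMemcpyAsync` silently degrades to a synchronous transfer. A raw-runtime sketch of a pinned allocation (illustrative names; the library also offers its own pinned-memory allocation facilities):

    float* h_a;                            // page-locked host buffer, needed for
    cudaMallocHost((void**)&h_a, nbytes);  // copies to overlap with kernel execution
    // ... enqueue copies and kernel launches on the streams ...
    cudaFreeHost(h_a);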
examples/other/array_management.cu (8 changes: 4 additions & 4 deletions)

@@ -97,8 +97,8 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) {
 
 // also asynchronously
 auto stream = device.create_stream(cuda::stream::async);
-cuda::memory::async::copy(other_arr, span_out, stream);
-cuda::memory::async::copy(span_in, other_arr, stream);
+cuda::memory::copy(other_arr, span_out, stream);
+cuda::memory::copy(span_in, other_arr, stream);
 device.synchronize();
 check_output_is_iota("copy from (managed) global memory into a 3D array, asynchronously", span_in);
 }

@@ -162,8 +162,8 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)
 
 // also asynchronously
 auto stream = cuda::stream::create(device, cuda::stream::async);
-cuda::memory::async::copy(other_arr, span_out, stream);
-cuda::memory::async::copy(span_in, other_arr, stream);
+cuda::memory::copy(other_arr, span_out, stream);
+cuda::memory::copy(span_in, other_arr, stream);
 device.synchronize();
 
 check_output_is_iota("copy from (managed) global memory into a 2D array, asynchronously", span_in);
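
Regarding the third bullet of the commit message: at the raw-runtime level, a "copy parameters structure" is e.g. `cudaMemcpy3DParms`, which fully describes a copy (including ones involving CUDA arrays, as in this file) and can be enqueued on a stream. An illustrative raw-runtime sketch, not this library's wrapper types:

    cudaMemcpy3DParms p = {};                                        // zero-initialize all fields
    p.srcPtr   = make_cudaPitchedPtr(src, w * sizeof(float), w, h);  // linear (pitched) source
    p.dstArray = arr;                                                // 3D CUDA array destination
    p.extent   = make_cudaExtent(w, h, d);                           // extent in array elements
    p.kind     = cudaMemcpyDefault;
    cudaMemcpy3DAsync(&p, raw_stream);                               // asynchronous, on raw_stream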