Skip to content

Commit ffbae89

Browse files
committed
Regards #689, Fixes #688: Unification of asynchronous & synchronous copy functions:
* All copy functions now take an optional stream via an `optional_ref` parameter.
* No longer using the `cuda::memory::async` subnamespace for any copy functions; they are all directly in `cuda::memory`.
* Fixes #688: Now supporting async copy using copy-parameters structures.
* Explicitly including `memory.hpp` in `multi_wrapper_impls/memory.hpp`.
1 parent 704a853 commit ffbae89

File tree

9 files changed

+529
-674
lines changed

9 files changed

+529
-674
lines changed

examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -124,8 +124,8 @@ int main(int argc, const char **argv)
124124

125125
stream.enqueue.kernel_launch(kernel, launch_config, d_inputArr.data(), d_numOfOdds.data(), d_sumOfOddEvenElems.data(), arrSize);
126126

127-
cuda::memory::async::copy(h_numOfOdds, d_numOfOdds, stream);
128-
cuda::memory::async::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);
127+
cuda::memory::copy(h_numOfOdds, d_numOfOdds, stream);
128+
cuda::memory::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);
129129

130130
stream.synchronize();
131131

examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -154,7 +154,7 @@ void enqueue_p2p_copy(
154154
// Since we assume Compute Capability >= 2.0, all devices support the
155155
// Unified Virtual Address Space, so we don't need to use
156156
// cudaMemcpyPeerAsync - cudaMemcpyAsync is enough.
157-
cuda::memory::async::copy(dest, src, stream);
157+
cuda::memory::copy(dest, src, stream);
158158
}
159159
}
160160
}

examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -152,8 +152,8 @@ int main(int argc, char** argv)
152152
auto d_C = cuda::memory::make_unique_span<float>(device, N);
153153

154154

155-
cuda::memory::async::copy(d_A, h_A.get(), size, stream);
156-
cuda::memory::async::copy(d_B, h_B.get(), size, stream);
155+
cuda::memory::copy(d_A, h_A.get(), size, stream);
156+
cuda::memory::copy(d_B, h_B.get(), size, stream);
157157

158158
auto launch_config = cuda::launch_config_builder()
159159
.overall_size(N)
@@ -164,7 +164,7 @@ int main(int argc, char** argv)
164164

165165
stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N);
166166

167-
cuda::memory::async::copy(h_C.get(), d_C, size, stream);
167+
cuda::memory::copy(h_C.get(), d_C, size, stream);
168168
stream.synchronize();
169169

170170
for (int i = 0; i < N; ++i) {

examples/modified_cuda_samples/simpleStreams/simpleStreams.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,7 @@ void run_simple_streams_example(
143143

144144
// time memcpy from device
145145
start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed
146-
cuda::memory::async::copy(h_a.get(), d_a, streams[0]);
146+
cuda::memory::copy(h_a.get(), d_a, streams[0]);
147147
stop_event.record();
148148
stop_event.synchronize(); // block until the event is actually recorded
149149
auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event);
@@ -207,7 +207,7 @@ void run_simple_streams_example(
207207
// commence executing when all previous CUDA calls in stream x have completed
208208
for (int i = 0; i < nstreams; i++)
209209
{
210-
cuda::memory::async::copy(
210+
cuda::memory::copy(
211211
h_a.data() + i * params.n / nstreams,
212212
d_a.data() + i * params.n / nstreams, nbytes / nstreams,
213213
streams[i]);

examples/other/array_management.cu

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -97,8 +97,8 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) {
9797

9898
// also asynchronously
9999
auto stream = device.create_stream(cuda::stream::async);
100-
cuda::memory::async::copy(other_arr, span_out, stream);
101-
cuda::memory::async::copy(span_in, other_arr, stream);
100+
cuda::memory::copy(other_arr, span_out, stream);
101+
cuda::memory::copy(span_in, other_arr, stream);
102102
device.synchronize();
103103
check_output_is_iota("copy from (managed) global memory into a 3D array, asynchronously", span_in);
104104
}
@@ -162,8 +162,8 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)
162162

163163
// also asynchronously
164164
auto stream = cuda::stream::create(device, cuda::stream::async);
165-
cuda::memory::async::copy(other_arr, span_out, stream);
166-
cuda::memory::async::copy(span_in, other_arr, stream);
165+
cuda::memory::copy(other_arr, span_out, stream);
166+
cuda::memory::copy(span_in, other_arr, stream);
167167
device.synchronize();
168168

169169
check_output_is_iota("copy from (managed) global memory into a 2D array, asynchronously", span_in);

0 commit comments

Comments (0)