Skip to content

Commit ffbae89

Browse files
committed
Regards #689, Fixes #688: Unification of asynchronous & synchronous copy functions:
* All copy functions now take an optional stream via an `optional_ref` parameter.
* No longer using the `cuda::memory::async` subnamespace for any copy functions; they are all directly in `cuda::memory`.
* Fixes #688: Now supporting async copy using copy-parameters structures.
* Explicitly including `memory.hpp` in `multi_wrapper_impls/memory.hpp`.
1 parent 704a853 commit ffbae89

File tree

9 files changed

+529
-674
lines changed

9 files changed

+529
-674
lines changed

examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -124,8 +124,8 @@ int main(int argc, const char **argv)
124124

125125
stream.enqueue.kernel_launch(kernel, launch_config, d_inputArr.data(), d_numOfOdds.data(), d_sumOfOddEvenElems.data(), arrSize);
126126

127-
cuda::memory::async::copy(h_numOfOdds, d_numOfOdds, stream);
128-
cuda::memory::async::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);
127+
cuda::memory::copy(h_numOfOdds, d_numOfOdds, stream);
128+
cuda::memory::copy(h_sumOfOddEvenElems, d_sumOfOddEvenElems, stream);
129129

130130
stream.synchronize();
131131

examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -154,7 +154,7 @@ void enqueue_p2p_copy(
154154
// Since we assume Compute Capability >= 2.0, all devices support the
155155
// Unified Virtual Address Space, so we don't need to use
156156
// cudaMemcpyPeerAsync - cudaMemcpyAsync is enough.
157-
cuda::memory::async::copy(dest, src, stream);
157+
cuda::memory::copy(dest, src, stream);
158158
}
159159
}
160160
}

examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -152,8 +152,8 @@ int main(int argc, char** argv)
152152
auto d_C = cuda::memory::make_unique_span<float>(device, N);
153153

154154

155-
cuda::memory::async::copy(d_A, h_A.get(), size, stream);
156-
cuda::memory::async::copy(d_B, h_B.get(), size, stream);
155+
cuda::memory::copy(d_A, h_A.get(), size, stream);
156+
cuda::memory::copy(d_B, h_B.get(), size, stream);
157157

158158
auto launch_config = cuda::launch_config_builder()
159159
.overall_size(N)
@@ -164,7 +164,7 @@ int main(int argc, char** argv)
164164

165165
stream.enqueue.kernel_launch(vecAdd_kernel, launch_config, d_A.data(), d_B.data(), d_C.data(), N);
166166

167-
cuda::memory::async::copy(h_C.get(), d_C, size, stream);
167+
cuda::memory::copy(h_C.get(), d_C, size, stream);
168168
stream.synchronize();
169169

170170
for (int i = 0; i < N; ++i) {

examples/modified_cuda_samples/simpleStreams/simpleStreams.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -143,7 +143,7 @@ void run_simple_streams_example(
143143

144144
// time memcpy from device
145145
start_event.record(); // record on the default stream, to ensure that all previous CUDA calls have completed
146-
cuda::memory::async::copy(h_a.get(), d_a, streams[0]);
146+
cuda::memory::copy(h_a.get(), d_a, streams[0]);
147147
stop_event.record();
148148
stop_event.synchronize(); // block until the event is actually recorded
149149
auto time_memcpy = cuda::event::time_elapsed_between(start_event, stop_event);
@@ -207,7 +207,7 @@ void run_simple_streams_example(
207207
// commence executing when all previous CUDA calls in stream x have completed
208208
for (int i = 0; i < nstreams; i++)
209209
{
210-
cuda::memory::async::copy(
210+
cuda::memory::copy(
211211
h_a.data() + i * params.n / nstreams,
212212
d_a.data() + i * params.n / nstreams, nbytes / nstreams,
213213
streams[i]);

examples/other/array_management.cu

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -97,8 +97,8 @@ void array_3d_example(cuda::device_t& device, size_t w, size_t h, size_t d) {
9797

9898
// also asynchronously
9999
auto stream = device.create_stream(cuda::stream::async);
100-
cuda::memory::async::copy(other_arr, span_out, stream);
101-
cuda::memory::async::copy(span_in, other_arr, stream);
100+
cuda::memory::copy(other_arr, span_out, stream);
101+
cuda::memory::copy(span_in, other_arr, stream);
102102
device.synchronize();
103103
check_output_is_iota("copy from (managed) global memory into a 3D array, asynchronously", span_in);
104104
}
@@ -162,8 +162,8 @@ void array_2d_example(cuda::device_t& device, size_t w, size_t h)
162162

163163
// also asynchronously
164164
auto stream = cuda::stream::create(device, cuda::stream::async);
165-
cuda::memory::async::copy(other_arr, span_out, stream);
166-
cuda::memory::async::copy(span_in, other_arr, stream);
165+
cuda::memory::copy(other_arr, span_out, stream);
166+
cuda::memory::copy(span_in, other_arr, stream);
167167
device.synchronize();
168168

169169
check_output_is_iota("copy from (managed) global memory into a 2D array, asynchronously", span_in);

0 commit comments

Comments (0)