From d7512ab144050b3c910d22c1a5da69821ffe085e Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg <eyalroz1@gmx.com>
Date: Mon, 12 Feb 2024 00:44:17 +0200
Subject: [PATCH] Fixes #585: Can now use `cuda::memory::make_unique()` and
 have it default to device-global memory

---
 examples/by_api_module/unified_addressing.cpp |  4 ++--
 .../asyncAPI/asyncAPI.cu                      |  2 +-
 .../binaryPartitionCG/binaryPartitionCG.cu    |  6 +++---
 .../clock_nvrtc/clock.cpp                     |  6 +++---
 .../inlinePTX/inlinePTX.cu                    |  2 +-
 .../p2pBandwidthLatencyTest.cu                | 12 +++++------
 .../simpleDrvRuntimePTX.cpp                   |  6 +++---
 .../simpleStreams/simpleStreams.cu            |  4 ++--
 .../vectorAdd/vectorAdd.cu                    |  6 +++---
 .../vectorAdd_nvrtc/vectorAdd_nvrtc.cpp       |  6 +++---
 .../vectorAdd_ptx/vectorAdd_ptx.cpp           |  6 +++---
 .../other/io_compute_overlap_with_streams.cu  |  6 +++---
 examples/other/jitify/jitify.cpp              | 10 ++++-----
 examples/other/vectorAdd_profiled.cu          |  6 +++---
 src/cuda/api/unique_ptr.hpp                   | 21 +++++++++++++++++++
 15 files changed, 62 insertions(+), 41 deletions(-)
diff --git a/examples/by_api_module/unified_addressing.cpp b/examples/by_api_module/unified_addressing.cpp
index b5d4c942..1cc545c0 100644
--- a/examples/by_api_module/unified_addressing.cpp
+++ b/examples/by_api_module/unified_addressing.cpp
@@ -29,8 +29,8 @@ void pointer_properties(const cuda::device_t& device)
 		cuda::context::create(device)
 	};
 	cuda::memory::device::unique_ptr<char[]> regions[2] = {
-		cuda::memory::device::make_unique<char[]>(contexts[0], fixed_size),
-		cuda::memory::device::make_unique<char[]>(contexts[1], fixed_size)
+		cuda::memory::make_unique<char[]>(contexts[0], fixed_size),
+		cuda::memory::make_unique<char[]>(contexts[1], fixed_size)
 	};
 	void* raw_pointers[2] = {
 		regions[0].get(),
diff --git a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
index 71dd0ebc..996f29b2 100644
--- a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
+++ b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu
@@ -59,7 +59,7 @@ int main(int, char **)
 	auto a = cuda::memory::host::make_unique<datum[]>(n);
 	cuda::memory::host::zero(a.get(), num_bytes);
 
-	auto d_a = cuda::memory::device::make_unique<datum[]>(device, n);
+	auto d_a = cuda::memory::make_unique<datum[]>(device, n);
 
 	auto launch_config = cuda::launch_config_builder()
 		.overall_size(n)
diff --git a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu
index 314fda6c..eaa1412b 100644
--- a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu
+++ b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu
@@ -105,9 +105,9 @@ int main(int argc, const char **argv)
    
     auto stream = device.create_stream(cuda::stream::async);
 	// Note: With CUDA 11, we could allocate these asynchronously on the stream
-	auto d_inputArr = cuda::memory::device::make_unique<int[]>(device, arrSize);
-	auto d_numOfOdds = cuda::memory::device::make_unique<int>(device);
-	auto d_sumOfOddEvenElems = cuda::memory::device::make_unique<int[]>(device, 2);
+	auto d_inputArr = cuda::memory::make_unique<int[]>(device, arrSize);
+	auto d_numOfOdds = cuda::memory::make_unique<int>(device);
+	auto d_sumOfOddEvenElems = cuda::memory::make_unique<int[]>(device, 2);
 
 	// Note: There's some code repetition here; unique pointers don't also keep track of the allocated size.
 	// Unfortunately, the standard library does not offer an owning dynamically-allocated memory region
diff --git a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp
index c933f7d8..633d37dc 100644
--- a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp
+++ b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp
@@ -154,10 +154,10 @@ int main()
 	{
 		const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block;
 
-		auto d_input = cuda::memory::device::make_unique<float[]>(device, input_size);
-		auto d_output = cuda::memory::device::make_unique<float[]>(device, num_blocks);
+		auto d_input = cuda::memory::make_unique<float[]>(device, input_size);
+		auto d_output = cuda::memory::make_unique<float[]>(device, num_blocks);
 			// Note: We won't actually be checking the output...
-		auto d_timers = cuda::memory::device::make_unique<clock_t []>(device, num_timers);
+		auto d_timers = cuda::memory::make_unique<clock_t []>(device, num_timers);
 		cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float));
 
 		auto launch_config = cuda::launch_config_builder()
diff --git a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu
index cfb771fb..fb861605 100644
--- a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu
+++ b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu
@@ -43,7 +43,7 @@ int main(int, char **)
 	cuda::device::current::set_to_default();
 	auto device = cuda::device::current::get();
 
-	auto d_ptr = cuda::memory::device::make_unique<int[]>(device, N);
+	auto d_ptr = cuda::memory::make_unique<int[]>(device, N);
 	auto h_ptr = cuda::memory::host::make_unique<int[]>(N);
 
 	std::cout << "Generating data on CPU\n";
diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu
index e0179001..7592441d 100644
--- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu
+++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu
@@ -176,8 +176,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p
 
     for (auto device : cuda::devices()) {
         streams.push_back(device.create_stream(cuda::stream::async));
-        buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
-        buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
+        buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
+        buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
         start.push_back(device.create_event());
         stop.push_back(device.create_event());
     }
@@ -308,8 +308,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p)
     for (auto device : cuda::devices()) {
         streams_0.push_back(device.create_stream(cuda::stream::async));
         streams_1.push_back(device.create_stream(cuda::stream::async));
-        buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
-        buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
+        buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
+        buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
         start.push_back(device.create_event());
         stop.push_back(device.create_event());
     }
@@ -417,8 +417,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer
 
     for(auto device : cuda::devices()) {
         streams.push_back(device.create_stream(cuda::stream::async));
-        buffers.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
-        buffersD2D.push_back(cuda::memory::device::make_unique<int[]>(device, numElems));
+        buffers.push_back(cuda::memory::make_unique<int[]>(device, numElems));
+        buffersD2D.push_back(cuda::memory::make_unique<int[]>(device, numElems));
         start.push_back(device.create_event());
         stop.push_back(device.create_event());
     }
diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp
index 65123690..25151826 100644
--- a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp
+++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp
@@ -147,9 +147,9 @@ int main(int argc, char** argv)
 	std::generate_n(h_B.get(), N, generator);
 
     // Allocate vectors in device memory
-	auto d_A = cuda::memory::device::make_unique<float[]>(device, N);
-	auto d_B = cuda::memory::device::make_unique<float[]>(device, N);
-	auto d_C = cuda::memory::device::make_unique<float[]>(device, N);
+	auto d_A = cuda::memory::make_unique<float[]>(device, N);
+	auto d_B = cuda::memory::make_unique<float[]>(device, N);
+	auto d_C = cuda::memory::make_unique<float[]>(device, N);
 
 
 	cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream);
diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
index a9bf5b66..e68c5568 100644
--- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
+++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu
@@ -115,8 +115,8 @@ void run_simple_streams_example(
 
 	// allocate device memory
 	// pointers to data and init value in the device memory
-	auto d_a = cuda::memory::device::make_unique<int[]>(device, params.n);
-	auto d_c = cuda::memory::device::make_unique<int>(device);
+	auto d_a = cuda::memory::make_unique<int[]>(device, params.n);
+	auto d_c = cuda::memory::make_unique<int>(device);
 	cuda::memory::copy_single(d_c.get(), &c);
 
 	std::cout << "\nStarting Test\n";
diff --git a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu
index 1104d388..bf8e24f1 100644
--- a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu
+++ b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu
@@ -43,9 +43,9 @@ int main()
 	std::generate(h_B.get(), h_B.get() + numElements, generator);
 
 	auto device = cuda::device::current::get();
-	auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
+	auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_C = cuda::memory::make_unique<float[]>(device, numElements);
 
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
diff --git a/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp b/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp
index 42c784d9..5269d473 100644
--- a/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp
+++ b/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp
@@ -68,9 +68,9 @@ int main(void)
 	std::generate(h_A.get(), h_A.get() + numElements, generator);
 	std::generate(h_B.get(), h_B.get() + numElements, generator);
 
-	auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
+	auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_C = cuda::memory::make_unique<float[]>(device, numElements);
 
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
diff --git a/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp b/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp
index 4dfa6088..7c23f0b0 100644
--- a/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp
+++ b/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp
@@ -108,9 +108,9 @@ int main(void)
 	std::generate(h_A.get(), h_A.get() + numElements, generator);
 	std::generate(h_B.get(), h_B.get() + numElements, generator);
 
-	auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
+	auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_C = cuda::memory::make_unique<float[]>(device, numElements);
 
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
diff --git a/examples/other/io_compute_overlap_with_streams.cu b/examples/other/io_compute_overlap_with_streams.cu
index 652d209e..8305387f 100644
--- a/examples/other/io_compute_overlap_with_streams.cu
+++ b/examples/other/io_compute_overlap_with_streams.cu
@@ -71,9 +71,9 @@ std::vector<buffer_set_t> generate_buffers(
                 cuda::memory::host::make_unique<element_t[]>(num_elements),
                 cuda::memory::host::make_unique<element_t[]>(num_elements),
                 cuda::memory::host::make_unique<element_t[]>(num_elements),
-                cuda::memory::device::make_unique<element_t[]>(device, num_elements),
-                cuda::memory::device::make_unique<element_t[]>(device, num_elements),
-                cuda::memory::device::make_unique<element_t[]>(device, num_elements)
+                cuda::memory::make_unique<element_t[]>(device, num_elements),
+                cuda::memory::make_unique<element_t[]>(device, num_elements),
+                cuda::memory::make_unique<element_t[]>(device, num_elements)
             };
         }
     );
diff --git a/examples/other/jitify/jitify.cpp b/examples/other/jitify/jitify.cpp
index 395b45ca..380fa56a 100644
--- a/examples/other/jitify/jitify.cpp
+++ b/examples/other/jitify/jitify.cpp
@@ -156,7 +156,7 @@ void my_kernel(T* data) {
 	// TODO: A kernel::get(const module_t& module, const char* mangled_name function)
 	auto kernel = module.get_kernel(mangled_kernel_name);
 
-	auto d_data = cuda::memory::device::make_unique<T>(device);
+	auto d_data = cuda::memory::make_unique<T>(device);
 	T h_data = 5;
 	cuda::memory::copy_single<T>(d_data.get(), &h_data);
 
@@ -242,8 +242,8 @@ void my_kernel2(float const* indata, float* outdata) {
 	auto my_kernel1 = module.get_kernel(mangled_kernel_names[0]);
 	auto my_kernel2 = module.get_kernel(mangled_kernel_names[1]);
 
-	auto indata = cuda::memory::device::make_unique<T>(device);
-	auto outdata = cuda::memory::device::make_unique<T>(device);
+	auto indata = cuda::memory::make_unique<T>(device);
+	auto outdata = cuda::memory::make_unique<T>(device);
 	T inval = 3.14159f;
 	cuda::memory::copy_single<T>(indata.get(), &inval);
 
@@ -308,7 +308,7 @@ __global__ void constant_test(int *x) {
 	cuda::memory::copy(a, &inval[0]);
 	cuda::memory::copy(b_a, &inval[1]);
 	cuda::memory::copy(c_b_a, &inval[2]);
-	auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
+	auto outdata = cuda::memory::make_unique<int[]>(device, n_const);
 	auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
 	cuda::launch(kernel, launch_config, outdata.get());
 	int outval[n_const];
@@ -342,7 +342,7 @@ bool test_constant_2()
 	int inval[] = {3, 5, 9};
 	cuda::memory::copy(anon_b_a, inval);
 	auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point());
-	auto outdata = cuda::memory::device::make_unique<int[]>(device, n_const);
+	auto outdata = cuda::memory::make_unique<int[]>(device, n_const);
 	cuda::launch(kernel, launch_config, outdata.get());
 	int outval[n_const];
 	auto ptr = outdata.get();
diff --git a/examples/other/vectorAdd_profiled.cu b/examples/other/vectorAdd_profiled.cu
index 449b2005..6cf838c0 100644
--- a/examples/other/vectorAdd_profiled.cu
+++ b/examples/other/vectorAdd_profiled.cu
@@ -48,9 +48,9 @@ int main()
 	std::generate(h_B.get(), h_B.get() + numElements, generator);
 
 	auto device = cuda::device::current::get();
-	auto d_A = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_B = cuda::memory::device::make_unique<float[]>(device, numElements);
-	auto d_C = cuda::memory::device::make_unique<float[]>(device, numElements);
+	auto d_A = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_B = cuda::memory::make_unique<float[]>(device, numElements);
+	auto d_C = cuda::memory::make_unique<float[]>(device, numElements);
 
 	cuda::memory::copy(d_A.get(), h_A.get(), size);
 	cuda::memory::copy(d_B.get(), h_B.get(), size);
diff --git a/src/cuda/api/unique_ptr.hpp b/src/cuda/api/unique_ptr.hpp
index 0e4ae198..0e3222a6 100644
--- a/src/cuda/api/unique_ptr.hpp
+++ b/src/cuda/api/unique_ptr.hpp
@@ -127,6 +127,27 @@ inline unique_ptr<T> make_unique();
 
 } // namespace device
 
+/// See @ref `device::make_unique(const context_t& context, size_t num_elements)`
+template<typename T>
+inline device::unique_ptr<T> make_unique(const context_t& context, size_t num_elements)
+{
+	return device::make_unique<T>(context, num_elements);
+}
+
+/// See @ref `device::make_unique(const device_t& device, size_t num_elements)`
+template<typename T>
+inline device::unique_ptr<T> make_unique(const device_t& device, size_t num_elements)
+{
+	return device::make_unique<T>(device, num_elements);
+}
+
+/// See @ref `device::make_unique(const device_t& device)`
+template<typename T>
+inline device::unique_ptr<T> make_unique(const device_t& device)
+{
+	return device::make_unique<T>(device);
+}
+
 namespace host {
 
 template<typename T>