From d7512ab144050b3c910d22c1a5da69821ffe085e Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Mon, 12 Feb 2024 00:44:17 +0200 Subject: [PATCH] Fixes #585: Can now use `cuda::memory::make_unique()` and have it default to device-global memory --- examples/by_api_module/unified_addressing.cpp | 4 ++-- .../asyncAPI/asyncAPI.cu | 2 +- .../binaryPartitionCG/binaryPartitionCG.cu | 6 +++--- .../clock_nvrtc/clock.cpp | 6 +++--- .../inlinePTX/inlinePTX.cu | 2 +- .../p2pBandwidthLatencyTest.cu | 12 +++++------ .../simpleDrvRuntimePTX.cpp | 6 +++--- .../simpleStreams/simpleStreams.cu | 4 ++-- .../vectorAdd/vectorAdd.cu | 6 +++--- .../vectorAdd_nvrtc/vectorAdd_nvrtc.cpp | 6 +++--- .../vectorAdd_ptx/vectorAdd_ptx.cpp | 6 +++--- .../other/io_compute_overlap_with_streams.cu | 6 +++--- examples/other/jitify/jitify.cpp | 10 ++++----- examples/other/vectorAdd_profiled.cu | 6 +++--- src/cuda/api/unique_ptr.hpp | 21 +++++++++++++++++++ 15 files changed, 62 insertions(+), 41 deletions(-) diff --git a/examples/by_api_module/unified_addressing.cpp b/examples/by_api_module/unified_addressing.cpp index b5d4c942..1cc545c0 100644 --- a/examples/by_api_module/unified_addressing.cpp +++ b/examples/by_api_module/unified_addressing.cpp @@ -29,8 +29,8 @@ void pointer_properties(const cuda::device_t& device) cuda::context::create(device) }; cuda::memory::device::unique_ptr regions[2] = { - cuda::memory::device::make_unique(contexts[0], fixed_size), - cuda::memory::device::make_unique(contexts[1], fixed_size) + cuda::memory::make_unique(contexts[0], fixed_size), + cuda::memory::make_unique(contexts[1], fixed_size) }; void* raw_pointers[2] = { regions[0].get(), diff --git a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu index 71dd0ebc..996f29b2 100644 --- a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu +++ b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu @@ -59,7 +59,7 @@ int main(int, char **) auto a = cuda::memory::host::make_unique(n); cuda::memory::host::zero(a.get(), num_bytes); - auto d_a = cuda::memory::device::make_unique(device, n); + auto d_a = cuda::memory::make_unique(device, n); auto launch_config = cuda::launch_config_builder() .overall_size(n) diff --git a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu index 314fda6c..eaa1412b 100644 --- a/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu +++ b/examples/modified_cuda_samples/binaryPartitionCG/binaryPartitionCG.cu @@ -105,9 +105,9 @@ int main(int argc, const char **argv) auto stream = device.create_stream(cuda::stream::async); // Note: With CUDA 11, we could allocate these asynchronously on the stream - auto d_inputArr = cuda::memory::device::make_unique(device, arrSize); - auto d_numOfOdds = cuda::memory::device::make_unique(device); - auto d_sumOfOddEvenElems = cuda::memory::device::make_unique(device, 2); + auto d_inputArr = cuda::memory::make_unique(device, arrSize); + auto d_numOfOdds = cuda::memory::make_unique(device); + auto d_sumOfOddEvenElems = cuda::memory::make_unique(device, 2); // Note: There's some code repetition here; unique pointers don't also keep track of the allocated size. // Unfortunately, the standard library does not offer an owning dynamically-allocated memory region diff --git a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp index c933f7d8..633d37dc 100644 --- a/examples/modified_cuda_samples/clock_nvrtc/clock.cpp +++ b/examples/modified_cuda_samples/clock_nvrtc/clock.cpp @@ -154,10 +154,10 @@ int main() { const auto dynamic_shared_mem_size = sizeof(float) * 2 * num_threads_per_block; - auto d_input = cuda::memory::device::make_unique(device, input_size); - auto d_output = cuda::memory::device::make_unique(device, num_blocks); + auto d_input = cuda::memory::make_unique(device, input_size); + auto d_output = cuda::memory::make_unique(device, num_blocks); // Note: We won't actually be checking the output... - auto d_timers = cuda::memory::device::make_unique(device, num_timers); + auto d_timers = cuda::memory::make_unique(device, num_timers); cuda::memory::copy(d_input.get(), input.get(), input_size * sizeof(float)); auto launch_config = cuda::launch_config_builder() diff --git a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu index cfb771fb..fb861605 100644 --- a/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu +++ b/examples/modified_cuda_samples/inlinePTX/inlinePTX.cu @@ -43,7 +43,7 @@ int main(int, char **) cuda::device::current::set_to_default(); auto device = cuda::device::current::get(); - auto d_ptr = cuda::memory::device::make_unique(device, N); + auto d_ptr = cuda::memory::make_unique(device, N); auto h_ptr = cuda::memory::host::make_unique(N); std::cout << "Generating data on CPU\n"; diff --git a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu index e0179001..7592441d 100644 --- a/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu +++ b/examples/modified_cuda_samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu @@ -176,8 +176,8 @@ void outputBandwidthMatrix(P2PEngine mechanism, bool test_p2p, P2PDataTransfer p for (auto device : cuda::devices()) { streams.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::device::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::device::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -308,8 +308,8 @@ void outputBidirectionalBandwidthMatrix(P2PEngine p2p_mechanism, bool test_p2p) for (auto device : cuda::devices()) { streams_0.push_back(device.create_stream(cuda::stream::async)); streams_1.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::device::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::device::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } @@ -417,8 +417,8 @@ void outputLatencyMatrix(P2PEngine p2p_mechanism, bool test_p2p, P2PDataTransfer for(auto device : cuda::devices()) { streams.push_back(device.create_stream(cuda::stream::async)); - buffers.push_back(cuda::memory::device::make_unique(device, numElems)); - buffersD2D.push_back(cuda::memory::device::make_unique(device, numElems)); + buffers.push_back(cuda::memory::make_unique(device, numElems)); + buffersD2D.push_back(cuda::memory::make_unique(device, numElems)); start.push_back(device.create_event()); stop.push_back(device.create_event()); } diff --git a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp index 65123690..25151826 100644 --- a/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp +++ b/examples/modified_cuda_samples/simpleDrvRuntimePTX/simpleDrvRuntimePTX.cpp @@ -147,9 +147,9 @@ int main(int argc, char** argv) std::generate_n(h_B.get(), N, generator); // Allocate vectors in device memory - auto d_A = cuda::memory::device::make_unique(device, N); - auto d_B = cuda::memory::device::make_unique(device, N); - auto d_C = cuda::memory::device::make_unique(device, N); + auto d_A = cuda::memory::make_unique(device, N); + auto d_B = cuda::memory::make_unique(device, N); + auto d_C = cuda::memory::make_unique(device, N); cuda::memory::async::copy(d_A.get(), h_A.get(), size, stream); diff --git a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu index a9bf5b66..e68c5568 100644 --- a/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu +++ b/examples/modified_cuda_samples/simpleStreams/simpleStreams.cu @@ -115,8 +115,8 @@ void run_simple_streams_example( // allocate device memory // pointers to data and init value in the device memory - auto d_a = cuda::memory::device::make_unique(device, params.n); - auto d_c = cuda::memory::device::make_unique(device); + auto d_a = cuda::memory::make_unique(device, params.n); + auto d_c = cuda::memory::make_unique(device); cuda::memory::copy_single(d_c.get(), &c); std::cout << "\nStarting Test\n"; diff --git a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu index 1104d388..bf8e24f1 100644 --- a/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu +++ b/examples/modified_cuda_samples/vectorAdd/vectorAdd.cu @@ -43,9 +43,9 @@ int main() std::generate(h_B.get(), h_B.get() + numElements, generator); auto device = cuda::device::current::get(); - auto d_A = cuda::memory::device::make_unique(device, numElements); - auto d_B = cuda::memory::device::make_unique(device, numElements); - auto d_C = cuda::memory::device::make_unique(device, numElements); + auto d_A = cuda::memory::make_unique(device, numElements); + auto d_B = cuda::memory::make_unique(device, numElements); + auto d_C = cuda::memory::make_unique(device, numElements); cuda::memory::copy(d_A.get(), h_A.get(), size); cuda::memory::copy(d_B.get(), h_B.get(), size); diff --git a/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp b/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp index 42c784d9..5269d473 100644 --- a/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp +++ b/examples/modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp @@ -68,9 +68,9 @@ int main(void) std::generate(h_A.get(), h_A.get() + numElements, generator); std::generate(h_B.get(), h_B.get() + numElements, generator); - auto d_A = cuda::memory::device::make_unique(device, numElements); - auto d_B = cuda::memory::device::make_unique(device, numElements); - auto d_C = cuda::memory::device::make_unique(device, numElements); + auto d_A = cuda::memory::make_unique(device, numElements); + auto d_B = cuda::memory::make_unique(device, numElements); + auto d_C = cuda::memory::make_unique(device, numElements); cuda::memory::copy(d_A.get(), h_A.get(), size); cuda::memory::copy(d_B.get(), h_B.get(), size); diff --git a/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp b/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp index 4dfa6088..7c23f0b0 100644 --- a/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp +++ b/examples/modified_cuda_samples/vectorAdd_ptx/vectorAdd_ptx.cpp @@ -108,9 +108,9 @@ int main(void) std::generate(h_A.get(), h_A.get() + numElements, generator); std::generate(h_B.get(), h_B.get() + numElements, generator); - auto d_A = cuda::memory::device::make_unique(device, numElements); - auto d_B = cuda::memory::device::make_unique(device, numElements); - auto d_C = cuda::memory::device::make_unique(device, numElements); + auto d_A = cuda::memory::make_unique(device, numElements); + auto d_B = cuda::memory::make_unique(device, numElements); + auto d_C = cuda::memory::make_unique(device, numElements); cuda::memory::copy(d_A.get(), h_A.get(), size); cuda::memory::copy(d_B.get(), h_B.get(), size); diff --git a/examples/other/io_compute_overlap_with_streams.cu b/examples/other/io_compute_overlap_with_streams.cu index 652d209e..8305387f 100644 --- a/examples/other/io_compute_overlap_with_streams.cu +++ b/examples/other/io_compute_overlap_with_streams.cu @@ -71,9 +71,9 @@ std::vector generate_buffers( cuda::memory::host::make_unique(num_elements), cuda::memory::host::make_unique(num_elements), cuda::memory::host::make_unique(num_elements), - cuda::memory::device::make_unique(device, num_elements), - cuda::memory::device::make_unique(device, num_elements), - cuda::memory::device::make_unique(device, num_elements) + cuda::memory::make_unique(device, num_elements), + cuda::memory::make_unique(device, num_elements), + cuda::memory::make_unique(device, num_elements) }; } ); diff --git a/examples/other/jitify/jitify.cpp b/examples/other/jitify/jitify.cpp index 395b45ca..380fa56a 100644 --- a/examples/other/jitify/jitify.cpp +++ b/examples/other/jitify/jitify.cpp @@ -156,7 +156,7 @@ void my_kernel(T* data) { // TODO: A kernel::get(const module_t& module, const char* mangled_name function) auto kernel = module.get_kernel(mangled_kernel_name); - auto d_data = cuda::memory::device::make_unique(device); + auto d_data = cuda::memory::make_unique(device); T h_data = 5; cuda::memory::copy_single(d_data.get(), &h_data); @@ -242,8 +242,8 @@ void my_kernel2(float const* indata, float* outdata) { auto my_kernel1 = module.get_kernel(mangled_kernel_names[0]); auto my_kernel2 = module.get_kernel(mangled_kernel_names[1]); - auto indata = cuda::memory::device::make_unique(device); - auto outdata = cuda::memory::device::make_unique(device); + auto indata = cuda::memory::make_unique(device); + auto outdata = cuda::memory::make_unique(device); T inval = 3.14159f; cuda::memory::copy_single(indata.get(), &inval); @@ -308,7 +308,7 @@ __global__ void constant_test(int *x) { cuda::memory::copy(a, &inval[0]); cuda::memory::copy(b_a, &inval[1]); cuda::memory::copy(c_b_a, &inval[2]); - auto outdata = cuda::memory::device::make_unique(device, n_const); + auto outdata = cuda::memory::make_unique(device, n_const); auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point()); cuda::launch(kernel, launch_config, outdata.get()); int outval[n_const]; @@ -342,7 +342,7 @@ bool test_constant_2() int inval[] = {3, 5, 9}; cuda::memory::copy(anon_b_a, inval); auto launch_config = cuda::launch_configuration_t(cuda::grid::composite_dimensions_t::point()); - auto outdata = cuda::memory::device::make_unique(device, n_const); + auto outdata = cuda::memory::make_unique(device, n_const); cuda::launch(kernel, launch_config, outdata.get()); int outval[n_const]; auto ptr = outdata.get(); diff --git a/examples/other/vectorAdd_profiled.cu b/examples/other/vectorAdd_profiled.cu index 449b2005..6cf838c0 100644 --- a/examples/other/vectorAdd_profiled.cu +++ b/examples/other/vectorAdd_profiled.cu @@ -48,9 +48,9 @@ int main() std::generate(h_B.get(), h_B.get() + numElements, generator); auto device = cuda::device::current::get(); - auto d_A = cuda::memory::device::make_unique(device, numElements); - auto d_B = cuda::memory::device::make_unique(device, numElements); - auto d_C = cuda::memory::device::make_unique(device, numElements); + auto d_A = cuda::memory::make_unique(device, numElements); + auto d_B = cuda::memory::make_unique(device, numElements); + auto d_C = cuda::memory::make_unique(device, numElements); cuda::memory::copy(d_A.get(), h_A.get(), size); cuda::memory::copy(d_B.get(), h_B.get(), size); diff --git a/src/cuda/api/unique_ptr.hpp b/src/cuda/api/unique_ptr.hpp index 0e4ae198..0e3222a6 100644 --- a/src/cuda/api/unique_ptr.hpp +++ b/src/cuda/api/unique_ptr.hpp @@ -127,6 +127,27 @@ inline unique_ptr make_unique(); } // namespace device +/// See @ref `device::make_unique(const context_t& context, size_t num_elements)` +template +inline device::unique_ptr make_unique(const context_t& context, size_t num_elements) +{ + return device::make_unique(context, num_elements); +} + +/// See @ref `device::make_unique(const device_t& device, size_t num_elements)` +template +inline device::unique_ptr make_unique(const device_t& device, size_t num_elements) +{ + return device::make_unique(device, num_elements); +} + +/// See @ref `device::make_unique(const device_t& device)` +template +inline device::unique_ptr make_unique(const device_t& device) +{ + return device::make_unique(device); +} + namespace host { template