From f6e32544a36ef03fa087ba4665334dc311feae34 Mon Sep 17 00:00:00 2001
From: Wouter Bijlsma
Date: Thu, 25 Apr 2024 13:45:31 +0200
Subject: [PATCH] Add NANOVDB_USE_SYNC_CUDA_MALLOC define to force sync CUDA malloc

In virtualized environments that slice up the GPU and share it between
instances as vGPUs, GPU unified memory is usually disabled for security
reasons. Asynchronous CUDA malloc/free depends on GPU unified memory, so
previously it was not possible to deploy and run NanoVDB code in such
environments.

This commit adds the macros CUDA_MALLOC and CUDA_FREE and replaces all CUDA
alloc/free calls with these macros. CUDA_MALLOC and CUDA_FREE expand to the
asynchronous cudaMallocAsync/cudaFreeAsync if the following two conditions
are met:

- the CUDA version is >= 11.2, the first version that supports
  cudaMallocAsync/cudaFreeAsync
- NANOVDB_USE_SYNC_CUDA_MALLOC is not defined

In all other cases, CUDA_MALLOC and CUDA_FREE expand to the synchronous
cudaMalloc/cudaFree.

Since NanoVDB is distributed as a header-only library, defining
NANOVDB_USE_SYNC_CUDA_MALLOC is left to the consuming project's build system.
---
 .../nanovdb/util/cuda/CudaAddBlindData.cuh    |  4 +-
 nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h  |  6 +--
 .../nanovdb/util/cuda/CudaGridChecksum.cuh    | 32 +++++++-------
 nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh  |  8 ++--
 nanovdb/nanovdb/util/cuda/CudaGridStats.cuh   |  4 +-
 nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh |  8 ++--
 nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh |  4 +-
 .../nanovdb/util/cuda/CudaPointsToGrid.cuh    |  4 +-
 .../nanovdb/util/cuda/CudaSignedFloodFill.cuh |  4 +-
 nanovdb/nanovdb/util/cuda/CudaUtils.h         | 43 +++++++++++++------
 10 files changed, 66 insertions(+), 51 deletions(-)

diff --git a/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh b/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
index c750412458..ad6a447764 100644
--- a/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
@@ -62,7 +62,7 @@ cudaAddBlindData(const NanoGrid *d_grid,
     // extract byte sizes of the grid, blind meta data and blind data
     enum {GRID=0, META=1, DATA=2, CHECKSUM=3};
     uint64_t tmp[4], *d_tmp;
-    cudaCheck(cudaMallocAsync((void**)&d_tmp, 4*sizeof(uint64_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_tmp, 4*sizeof(uint64_t), stream));
     cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
         if (auto count = d_grid->blindDataCount()) {
             d_tmp[GRID] = PtrDiff(&d_grid->blindMetaData(0), d_grid);
@@ -114,7 +114,7 @@ cudaAddBlindData(const NanoGrid *d_grid,
     for (uint32_t i=0, n=grid.mBlindMetadataCount-1; imDataOffset += sizeof(GridBlindMetaData);
     grid.mGridSize += sizeof(GridBlindMetaData) + meta->blindDataSize();// expansion with 32 byte alignment
     });
     cudaCheckError();
-    cudaCheck(cudaFreeAsync(d_tmp, stream));
+    cudaCheck(CUDA_FREE(d_tmp, stream));
     GridChecksum cs(tmp[CHECKSUM]);
     cudaGridChecksum(reinterpret_cast(d_data), cs.mode());

diff --git a/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h b/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
index 4b9820771d..3a3a450bd6 100644
--- a/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
+++ b/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
@@ -153,7 +153,7 @@ inline void CudaDeviceBuffer::init(uint64_t size, bool host, void* stream)
         cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!).
Always 32B aligned checkPtr(mCpuData, "CudaDeviceBuffer::init: failed to allocate host buffer"); } else { - cudaCheck(cudaMallocAsync((void**)&mGpuData, size, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! + cudaCheck(CUDA_MALLOC((void**)&mGpuData, size, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! checkPtr(mGpuData, "CudaDeviceBuffer::init: failed to allocate device buffer"); } mSize = size; @@ -163,7 +163,7 @@ inline void CudaDeviceBuffer::deviceUpload(void* stream, bool sync) const { checkPtr(mCpuData, "uninitialized cpu data"); if (mGpuData == nullptr) { - cudaCheck(cudaMallocAsync((void**)&mGpuData, mSize, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! + cudaCheck(CUDA_MALLOC((void**)&mGpuData, mSize, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! } checkPtr(mGpuData, "uninitialized gpu data"); cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast(stream))); @@ -183,7 +183,7 @@ inline void CudaDeviceBuffer::deviceDownload(void* stream, bool sync) const inline void CudaDeviceBuffer::clear(void *stream) { - if (mGpuData) cudaCheck(cudaFreeAsync(mGpuData, reinterpret_cast(stream))); + if (mGpuData) cudaCheck(CUDA_FREE(mGpuData, reinterpret_cast(stream))); if (mCpuData) cudaCheck(cudaFreeHost(mCpuData)); mCpuData = mGpuData = nullptr; mSize = 0; diff --git a/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh b/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh index e3ae9a941f..c1d6231636 100644 --- a/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh @@ -67,7 +67,7 @@ __global__ void checksumKernel(const T *d_data, uint32_t* d_blockCRC, uint32_t b inline uint32_t* cudaCreateLut(cudaStream_t stream = 0) { uint32_t *d_lut; - cudaCheck(cudaMallocAsync((void**)&d_lut, 256*sizeof(uint32_t), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_lut, 256*sizeof(uint32_t), stream)); initLutKernel<<<1, 256, 0, stream>>>(d_lut); cudaCheckError(); return d_lut; @@ -93,10 +93,10 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C uint8_t *d_begin = reinterpret_cast(d_gridData); uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32 uint64_t size[2], *d_size;// {total size of grid, partial size for first checksum} - cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream)); // Compute CRC32 checksum of GridData, TreeData, RootData (+tiles), but exclude GridData::mMagic and GridData::mChecksum - cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { + cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__ (size_t) { d_size[0] = d_gridData->mGridSize; uint8_t *d_mid = d_gridData->template nodePtr<2>(); if (d_mid == nullptr) {// no upper nodes @@ -112,7 +112,7 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C }); cudaCheckError(); cudaCheck(cudaMemcpyAsync(size, d_size, 2*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaFreeAsync(d_size, stream)); + cudaCheck(CUDA_FREE(d_size, stream)); if (mode != ChecksumMode::Full || size[0] == size[1]) return ChecksumMode::Partial; @@ -120,7 +120,7 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C const uint8_t *d_mid = d_begin + size[1], *d_end = d_begin + size[0]; uint32_t *d_checksums;// 4096 
byte chunks const uint64_t checksumCount = (d_end - d_mid) >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// 4 KB (4096 byte) - cudaCheck(cudaMallocAsync((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream)); cudaLambdaKernel<<>>(checksumCount, [=] __device__(size_t tid) { uint32_t size = 1< *d_grid, ChecksumMode mode inline GridChecksum cudaGetGridChecksum(GridData *d_gridData, cudaStream_t stream = 0) { uint64_t checksum, *d_checksum; - cudaCheck(cudaMallocAsync((void**)&d_checksum, sizeof(uint64_t), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_checksum, sizeof(uint64_t), stream)); cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {*d_checksum = d_gridData->mChecksum;}); cudaCheckError(); cudaCheck(cudaMemcpyAsync(&checksum, d_checksum, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaFreeAsync(d_checksum, stream)); + cudaCheck(CUDA_FREE(d_checksum, stream)); return GridChecksum(checksum);; } @@ -172,7 +172,7 @@ void cudaGridChecksum(NanoGrid *d_grid, ChecksumMode mode = ChecksumMode uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32 uint64_t size[2], *d_size; - cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream)); cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { d_size[0] = d_grid->gridSize(); d_size[1] = d_grid->memUsage() + d_grid->tree().memUsage() + d_grid->tree().root().memUsage(); @@ -188,15 +188,15 @@ void cudaGridChecksum(NanoGrid *d_grid, ChecksumMode mode = ChecksumMode // Get node counts uint32_t nodeCount[3], *d_nodeCount, *d_checksums, *d_ptr; - cudaCheck(cudaMallocAsync((void**)&d_nodeCount, 3*sizeof(uint32_t), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_nodeCount, 3*sizeof(uint32_t), stream)); cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { auto &tree = d_grid->tree(); for (int i = 0; i < 3; ++i) d_nodeCount[i] = tree.nodeCount(i); }); cudaCheckError(); cudaCheck(cudaMemcpyAsync(nodeCount, d_nodeCount, 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaFreeAsync(d_nodeCount, stream)); - cudaCheck(cudaMallocAsync((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream)); + cudaCheck(CUDA_FREE(d_nodeCount, stream)); + cudaCheck(CUDA_MALLOC((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream)); auto nodeMgrHandle = cudaCreateNodeManager(d_grid, CudaDeviceBuffer(), stream); auto *d_nodeMgr = nodeMgrHandle.template deviceMgr(); @@ -232,9 +232,9 @@ void cudaGridChecksum(NanoGrid *d_grid, ChecksumMode mode = ChecksumMode }); cudaCheckError(); - cudaCheck(cudaFreeAsync(d_size, stream)); - cudaCheck(cudaFreeAsync(d_checksums, stream)); - cudaCheck(cudaFreeAsync(d_lut, stream)); + cudaCheck(CUDA_FREE(d_size, stream)); + cudaCheck(CUDA_FREE(d_checksums, stream)); + cudaCheck(CUDA_FREE(d_lut, stream)); }// cudaGridChecksum #endif diff --git a/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh b/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh index 5446c56231..dd547fb1d4 100644 --- a/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh @@ -71,7 +71,7 @@ cudaSplitGridHandles(const GridHandle &handle, const BufferT* other = n if (ptr == nullptr) return VectorT>(); VectorT> handles(handle.gridCount()); bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed - 
cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream)); for (uint32_t n=0; n(buffer.deviceData()); @@ -84,7 +84,7 @@ cudaSplitGridHandles(const GridHandle &handle, const BufferT* other = n handles[n] = GridHandle(std::move(buffer)); ptr += handle.gridSize(n); } - cudaCheck(cudaFreeAsync(d_dirty, stream)); + cudaCheck(CUDA_FREE(d_dirty, stream)); return std::move(handles); }// cudaSplitGridHandles @@ -106,7 +106,7 @@ cudaMergeGridHandles(const VectorT> &handles, const BufferT* auto buffer = BufferT::create(size, other, false, stream); uint8_t *dst = buffer.deviceData(); bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed - cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream)); for (auto &h : handles) { const uint8_t *src = h.deviceData(); for (uint32_t n=0; n> &handles, const BufferT* src += h.gridSize(n); } } - cudaCheck(cudaFreeAsync(d_dirty, stream)); + cudaCheck(CUDA_FREE(d_dirty, stream)); return GridHandle(std::move(buffer)); }// cudaMergeGridHandles diff --git a/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh b/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh index dcf5bfc850..0c4b970537 100644 --- a/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh @@ -210,7 +210,7 @@ void CudaGridStats::operator()(NanoGrid *d_grid, cudaStr StatsT *d_stats = nullptr; - if constexpr(StatsT::hasAverage()) cudaCheck(cudaMallocAsync((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream)); + if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_MALLOC((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream)); processLeaf<<>>(d_nodeMgr, d_stats); @@ -220,7 +220,7 @@ void CudaGridStats::operator()(NanoGrid *d_grid, cudaStr processRootAndGrid<<<1, 1, 0, stream>>>(d_nodeMgr, d_stats); - if constexpr(StatsT::hasAverage()) cudaCheck(cudaFreeAsync(d_stats, stream)); + if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_FREE(d_stats, stream)); } // CudaGridStats::operator()( Grid ) diff --git a/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh b/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh index 8394ecefe1..2374ba2f71 100644 --- a/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh @@ -64,7 +64,7 @@ public: /// @param srcGrid Device pointer to IndexGrid used as the source CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream = 0); - ~CudaIndexToGrid() {cudaCheck(cudaFreeAsync(mDevNodeAcc, mStream));} + ~CudaIndexToGrid() {cudaCheck(CUDA_FREE(mDevNodeAcc, mStream));} /// @brief Toggle on and off verbose mode /// @param on if true verbose is turned on @@ -295,7 +295,7 @@ CudaIndexToGrid::CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStrea : mStream(stream), mTimer(stream) { NANOVDB_ASSERT(d_srcGrid); - cudaCheck(cudaMallocAsync((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream)); + cudaCheck(CUDA_MALLOC((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream)); cudaCpyNodeCount<<<1, 1, 0, mStream>>>(d_srcGrid, mDevNodeAcc); cudaCheckError(); cudaCheck(cudaMemcpyAsync(&mNodeAcc, mDevNodeAcc, sizeof(NodeAccessor), cudaMemcpyDeviceToHost, mStream));// mNodeAcc = *mDevNodeAcc @@ -319,7 +319,7 @@ GridHandle CudaIndexToGrid::getHandle(const typename BuildTo cudaProcessRootTiles<<>>(mDevNodeAcc, srcValues); cudaCheckError(); - cudaCheck(cudaFreeAsync(mNodeAcc.d_gridName, mStream)); + cudaCheck(CUDA_FREE(mNodeAcc.d_gridName, mStream)); if (mVerbose) 
mTimer.restart("Process upper internal nodes"); cudaProcessInternalNodes<<>>(mDevNodeAcc, srcValues); @@ -362,7 +362,7 @@ inline BufferT CudaIndexToGrid::getBuffer(const BufferT &pool) if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device"); if (size_t size = mGridName.size()) { - cudaCheck(cudaMallocAsync((void**)&mNodeAcc.d_gridName, size, mStream)); + cudaCheck(CUDA_MALLOC((void**)&mNodeAcc.d_gridName, size, mStream)); cudaCheck(cudaMemcpyAsync(mNodeAcc.d_gridName, mGridName.data(), size, cudaMemcpyHostToDevice, mStream)); } else { mNodeAcc.d_gridName = nullptr; diff --git a/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh b/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh index 3d35a4b902..7ffb1db948 100644 --- a/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh @@ -38,7 +38,7 @@ cudaCreateNodeManager(const NanoGrid *d_grid, auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream); auto *d_data = (NodeManagerData*)buffer.deviceData(); size_t size = 0u, *d_size; - cudaCheck(cudaMallocAsync((void**)&d_size, sizeof(size_t), stream)); + cudaCheck(CUDA_MALLOC((void**)&d_size, sizeof(size_t), stream)); cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) { #ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS *d_data = NodeManagerData{NANOVDB_MAGIC_NODE, 0u, (void*)d_grid, {0u,0u,0u}}; @@ -58,7 +58,7 @@ cudaCreateNodeManager(const NanoGrid *d_grid, }); cudaCheckError(); cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); - cudaCheck(cudaFreeAsync(d_size, stream)); + cudaCheck(CUDA_FREE(d_size, stream)); if (size > sizeof(NodeManagerData)) { auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream)); diff --git a/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh b/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh index 733dc35cb9..d926d1a344 100644 --- a/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh @@ -265,7 +265,7 @@ public: { mData.map = map; mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst}); - cudaCheck(cudaMallocAsync((void**)&mDeviceData, sizeof(Data), mStream)); + cudaCheck(CUDA_MALLOC((void**)&mDeviceData, sizeof(Data), mStream)); } /// @brief Default constructor @@ -276,7 +276,7 @@ public: : CudaPointsToGrid(Map(scale, trans), stream) {} /// @brief Destructor - ~CudaPointsToGrid() {cudaCheck(cudaFreeAsync(mDeviceData, mStream));} + ~CudaPointsToGrid() {cudaCheck(CUDA_FREE(mDeviceData, mStream));} /// @brief Toggle on and off verbose mode /// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking diff --git a/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh b/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh index 2f4bf203d6..eb40a35aae 100644 --- a/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh +++ b/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh @@ -153,11 +153,11 @@ void CudaSignedFloodFill::operator()(NanoGrid *d_grid) static_assert(BuildTraits::is_float, "CudaSignedFloodFill only works on float grids"); NANOVDB_ASSERT(d_grid); uint64_t count[4], *d_count = nullptr; - cudaCheck(cudaMallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream)); + cudaCheck(CUDA_MALLOC((void**)&d_count, 4*sizeof(uint64_t), mStream)); cudaCpyNodeCount<<<1, 1, 0, mStream>>>(d_grid, d_count); cudaCheckError(); 
     cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream));
-    cudaCheck(cudaFreeAsync(d_count, mStream));
+    cudaCheck(CUDA_FREE(d_count, mStream));
     static const int threadsPerBlock = 128;
     auto blocksPerGrid = [&](size_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;};
diff --git a/nanovdb/nanovdb/util/cuda/CudaUtils.h b/nanovdb/nanovdb/util/cuda/CudaUtils.h
index 40001748ee..a40b41f694 100644
--- a/nanovdb/nanovdb/util/cuda/CudaUtils.h
+++ b/nanovdb/nanovdb/util/cuda/CudaUtils.h
@@ -53,20 +53,35 @@
         cudaCheck(cudaGetLastError()); \
     }
-#if CUDART_VERSION < 11020 // 11.2 introduced cudaMallocAsync and cudaFreeAsync
-
-/// @brief Dummy implementation of cudaMallocAsync that calls cudaMalloc
-/// @param d_ptr Device pointer to allocated device memory
-/// @param size Number of bytes to allocate
-/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored)
-/// @return Cuda error code
-inline cudaError_t cudaMallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);}
-
-/// @brief Dummy implementation of cudaFreeAsync that calls cudaFree
-/// @param d_ptr Device pointer that will be freed
-/// @param dummy The stream establishing the stream ordering promise (ignored)
-/// @return Cuda error code
-inline cudaError_t cudaFreeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);}
+// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2; for older CUDA
+// versions we fall back to cudaMalloc and cudaFree. The fallback can also be forced
+// by defining NANOVDB_USE_SYNC_CUDA_MALLOC. This may be useful when deploying NanoVDB
+// code in virtualized environments that share the GPU between instances by slicing
+// it up into vGPUs. In such environments GPU unified memory is usually disabled
+// for security reasons, which means cudaMallocAsync cannot be used.
+#if (CUDART_VERSION < 11020) || defined(NANOVDB_USE_SYNC_CUDA_MALLOC)
+
+// Synchronous fallback: the stream argument is accepted but ignored.
+#define CUDA_MALLOC(d_ptr, size, stream) cudaMalloc((d_ptr), (size))
+#define CUDA_FREE(d_ptr, stream) cudaFree((d_ptr))
+
+#else
+
+// Stream-ordered allocation and free on the stream passed by the caller.
+#define CUDA_MALLOC(d_ptr, size, stream) cudaMallocAsync((d_ptr), (size), (stream))
+#define CUDA_FREE(d_ptr, stream) cudaFreeAsync((d_ptr), (stream))
 
 #endif
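
Reviewer note (not part of the patch): the sketch below illustrates how a consuming
project might force the synchronous fallback and exercise the new macros. It is a
minimal example under stated assumptions: the include path nanovdb/util/cuda/CudaUtils.h,
the cudaCheck macro defined in that header, and the file name sync_malloc_check.cu are
illustrative. In practice the define would typically come from the build system,
e.g. nvcc -DNANOVDB_USE_SYNC_CUDA_MALLOC, as the commit message suggests.

// sync_malloc_check.cu -- minimal sketch, not part of this patch.
// Forces the synchronous allocation path before any NanoVDB CUDA header is seen.
#define NANOVDB_USE_SYNC_CUDA_MALLOC
#include <nanovdb/util/cuda/CudaUtils.h>

#include <cstdio>

int main()
{
    cudaStream_t stream = 0; // default stream; ignored by the synchronous fallback
    void* d_ptr = nullptr;

    // With NANOVDB_USE_SYNC_CUDA_MALLOC defined (or CUDA < 11.2) these expand to
    // cudaMalloc/cudaFree; otherwise to cudaMallocAsync/cudaFreeAsync on 'stream'.
    cudaCheck(CUDA_MALLOC(&d_ptr, 1024, stream));
    cudaCheck(CUDA_FREE(d_ptr, stream));

#if defined(NANOVDB_USE_SYNC_CUDA_MALLOC) || (CUDART_VERSION < 11020)
    std::printf("allocation path: synchronous cudaMalloc/cudaFree\n");
#else
    std::printf("allocation path: asynchronous cudaMallocAsync/cudaFreeAsync\n");
#endif
    return 0;
}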