Add NANOVDB_USE_SYNC_CUDA_MALLOC define to force sync CUDA malloc
In virtualized environments that slice up the GPU and share it
between instances as vGPUs, GPU unified memory is usually disabled
for security reasons. Asynchronous CUDA malloc/free depends on GPU
unified memory, so it was previously not possible to deploy and run
NanoVDB code in such environments.

This commit adds the macros CUDA_MALLOC and CUDA_FREE and replaces all
CUDA alloc/free calls with them. CUDA_MALLOC and CUDA_FREE expand to the
asynchronous cudaMallocAsync/cudaFreeAsync calls if both of the following
conditions are met:

  - the CUDA version is >= 11.2, the first version that supports
    cudaMallocAsync/cudaFreeAsync
  - NANOVDB_USE_SYNC_CUDA_MALLOC is not defined

In all other cases, CUDA_MALLOC and CUDA_FREE expand to the synchronous
cudaMalloc/cudaFree calls.

Since NanoVDB is distributed as a header-only library, defining the
NANOVDB_USE_SYNC_CUDA_MALLOC flag is left to the consuming project's
build system, as illustrated by the sketch below.
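
For illustration only (not part of this commit), here is a minimal sketch of forcing the synchronous path from a single translation unit. The include path and the CudaDeviceBuffer::create() call pattern are assumptions taken from this diff; in a real project the define would normally be passed as a compile flag by the build system instead.

// Minimal sketch, assuming the NanoVDB headers from this repository are on the
// include path. Normally the build system would supply the define, e.g.
//   nvcc -DNANOVDB_USE_SYNC_CUDA_MALLOC -I<openvdb>/nanovdb -o app app.cu
#define NANOVDB_USE_SYNC_CUDA_MALLOC            // must precede any NanoVDB CUDA header
#include <cuda_runtime.h>
#include <nanovdb/util/cuda/CudaDeviceBuffer.h> // include path assumed from this diff

int main()
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // create(size, pool, host, stream) is the call pattern used elsewhere in this
    // diff; with the define above, the device allocation inside it expands to a
    // plain cudaMalloc instead of cudaMallocAsync.
    auto buffer = nanovdb::CudaDeviceBuffer::create(1 << 20, nullptr, false, stream);

    cudaStreamDestroy(stream);
    return 0;
}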
w0utert committed Apr 25, 2024
1 parent 08409f0 commit f6e3254
Showing 10 changed files with 66 additions and 51 deletions.
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
@@ -62,7 +62,7 @@ cudaAddBlindData(const NanoGrid<BuildT> *d_grid,
// extract byte sizes of the grid, blind meta data and blind data
enum {GRID=0, META=1, DATA=2, CHECKSUM=3};
uint64_t tmp[4], *d_tmp;
cudaCheck(cudaMallocAsync((void**)&d_tmp, 4*sizeof(uint64_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_tmp, 4*sizeof(uint64_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
if (auto count = d_grid->blindDataCount()) {
d_tmp[GRID] = PtrDiff(&d_grid->blindMetaData(0), d_grid);
@@ -114,7 +114,7 @@ cudaAddBlindData(const NanoGrid<BuildT> *d_grid,
for (uint32_t i=0, n=grid.mBlindMetadataCount-1; i<n; ++i, ++meta) meta->mDataOffset += sizeof(GridBlindMetaData);
grid.mGridSize += sizeof(GridBlindMetaData) + meta->blindDataSize();// expansion with 32 byte alignment
}); cudaCheckError();
cudaCheck(cudaFreeAsync(d_tmp, stream));
cudaCheck(CUDA_FREE(d_tmp, stream));

GridChecksum cs(tmp[CHECKSUM]);
cudaGridChecksum(reinterpret_cast<GridData*>(d_data), cs.mode());
6 changes: 3 additions & 3 deletions nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
@@ -153,7 +153,7 @@ inline void CudaDeviceBuffer::init(uint64_t size, bool host, void* stream)
cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
checkPtr(mCpuData, "CudaDeviceBuffer::init: failed to allocate host buffer");
} else {
cudaCheck(cudaMallocAsync((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
cudaCheck(CUDA_MALLOC((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
checkPtr(mGpuData, "CudaDeviceBuffer::init: failed to allocate device buffer");
}
mSize = size;
@@ -163,7 +163,7 @@ inline void CudaDeviceBuffer::deviceUpload(void* stream, bool sync) const
{
checkPtr(mCpuData, "uninitialized cpu data");
if (mGpuData == nullptr) {
cudaCheck(cudaMallocAsync((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
cudaCheck(CUDA_MALLOC((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
}
checkPtr(mGpuData, "uninitialized gpu data");
cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream)));
@@ -183,7 +183,7 @@ inline void CudaDeviceBuffer::deviceDownload(void* stream, bool sync) const

inline void CudaDeviceBuffer::clear(void *stream)
{
if (mGpuData) cudaCheck(cudaFreeAsync(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
if (mGpuData) cudaCheck(CUDA_FREE(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
if (mCpuData) cudaCheck(cudaFreeHost(mCpuData));
mCpuData = mGpuData = nullptr;
mSize = 0;
32 changes: 16 additions & 16 deletions nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
@@ -67,7 +67,7 @@ __global__ void checksumKernel(const T *d_data, uint32_t* d_blockCRC, uint32_t b
inline uint32_t* cudaCreateLut(cudaStream_t stream = 0)
{
uint32_t *d_lut;
cudaCheck(cudaMallocAsync((void**)&d_lut, 256*sizeof(uint32_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_lut, 256*sizeof(uint32_t), stream));
initLutKernel<<<1, 256, 0, stream>>>(d_lut);
cudaCheckError();
return d_lut;
@@ -93,10 +93,10 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
uint8_t *d_begin = reinterpret_cast<uint8_t*>(d_gridData);
uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32
uint64_t size[2], *d_size;// {total size of grid, partial size for first checksum}
cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream));

// Compute CRC32 checksum of GridData, TreeData, RootData (+tiles), but exclude GridData::mMagic and GridData::mChecksum
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__ (size_t) {
d_size[0] = d_gridData->mGridSize;
uint8_t *d_mid = d_gridData->template nodePtr<2>();
if (d_mid == nullptr) {// no upper nodes
@@ -112,15 +112,15 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(size, d_size, 2*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
cudaCheck(cudaFreeAsync(d_size, stream));
cudaCheck(CUDA_FREE(d_size, stream));

if (mode != ChecksumMode::Full || size[0] == size[1]) return ChecksumMode::Partial;

// Compute CRC32 checksum of 4K block of everything remaining in the buffer, i.e. nodes and blind data
const uint8_t *d_mid = d_begin + size[1], *d_end = d_begin + size[0];
uint32_t *d_checksums;// 4096 byte chunks
const uint64_t checksumCount = (d_end - d_mid) >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// 4 KB (4096 byte)
cudaCheck(cudaMallocAsync((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream));
cudaLambdaKernel<<<numBlocks(checksumCount), mNumThreads, 0, stream>>>(checksumCount, [=] __device__(size_t tid) {
uint32_t size = 1<<NANOVDB_CRC32_LOG2_BLOCK_SIZE;
if (tid+1 == checksumCount) size += d_end - d_mid - (checksumCount<<NANOVDB_CRC32_LOG2_BLOCK_SIZE);
@@ -132,8 +132,8 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
p[1] = crc32::checksum((const uint8_t*)d_checksums, checksumCount*sizeof(uint32_t), d_lut);
});
cudaCheckError();
cudaCheck(cudaFreeAsync(d_checksums, stream));
cudaCheck(cudaFreeAsync(d_lut, stream));
cudaCheck(CUDA_FREE(d_checksums, stream));
cudaCheck(CUDA_FREE(d_lut, stream));

return ChecksumMode::Full;
}// cudaGridChecksum
@@ -147,11 +147,11 @@ inline ChecksumMode cudaGridChecksum(NanoGrid<BuildT> *d_grid, ChecksumMode mode
inline GridChecksum cudaGetGridChecksum(GridData *d_gridData, cudaStream_t stream = 0)
{
uint64_t checksum, *d_checksum;
cudaCheck(cudaMallocAsync((void**)&d_checksum, sizeof(uint64_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_checksum, sizeof(uint64_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {*d_checksum = d_gridData->mChecksum;});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&checksum, d_checksum, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
cudaCheck(cudaFreeAsync(d_checksum, stream));
cudaCheck(CUDA_FREE(d_checksum, stream));
return GridChecksum(checksum);;
}

@@ -172,7 +172,7 @@ void cudaGridChecksum(NanoGrid<ValueT> *d_grid, ChecksumMode mode = ChecksumMode

uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32
uint64_t size[2], *d_size;
cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
d_size[0] = d_grid->gridSize();
d_size[1] = d_grid->memUsage() + d_grid->tree().memUsage() + d_grid->tree().root().memUsage();
@@ -188,15 +188,15 @@ void cudaGridChecksum(NanoGrid<ValueT> *d_grid, ChecksumMode mode = ChecksumMode

// Get node counts
uint32_t nodeCount[3], *d_nodeCount, *d_checksums, *d_ptr;
cudaCheck(cudaMallocAsync((void**)&d_nodeCount, 3*sizeof(uint32_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_nodeCount, 3*sizeof(uint32_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
auto &tree = d_grid->tree();
for (int i = 0; i < 3; ++i) d_nodeCount[i] = tree.nodeCount(i);
});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(nodeCount, d_nodeCount, 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
cudaCheck(cudaFreeAsync(d_nodeCount, stream));
cudaCheck(cudaMallocAsync((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream));
cudaCheck(CUDA_FREE(d_nodeCount, stream));
cudaCheck(CUDA_MALLOC((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream));

auto nodeMgrHandle = cudaCreateNodeManager<ValueT, CudaDeviceBuffer>(d_grid, CudaDeviceBuffer(), stream);
auto *d_nodeMgr = nodeMgrHandle.template deviceMgr<ValueT>();
@@ -232,9 +232,9 @@ void cudaGridChecksum(NanoGrid<ValueT> *d_grid, ChecksumMode mode = ChecksumMode
});
cudaCheckError();

cudaCheck(cudaFreeAsync(d_size, stream));
cudaCheck(cudaFreeAsync(d_checksums, stream));
cudaCheck(cudaFreeAsync(d_lut, stream));
cudaCheck(CUDA_FREE(d_size, stream));
cudaCheck(CUDA_FREE(d_checksums, stream));
cudaCheck(CUDA_FREE(d_lut, stream));
}// cudaGridChecksum

#endif
8 changes: 4 additions & 4 deletions nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh
@@ -71,7 +71,7 @@ cudaSplitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = n
if (ptr == nullptr) return VectorT<GridHandle<BufferT>>();
VectorT<GridHandle<BufferT>> handles(handle.gridCount());
bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream));
cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream));
for (uint32_t n=0; n<handle.gridCount(); ++n) {
auto buffer = BufferT::create(handle.gridSize(n), other, false, stream);
GridData *dst = reinterpret_cast<GridData*>(buffer.deviceData());
@@ -84,7 +84,7 @@ cudaSplitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = n
handles[n] = GridHandle<BufferT>(std::move(buffer));
ptr += handle.gridSize(n);
}
cudaCheck(cudaFreeAsync(d_dirty, stream));
cudaCheck(CUDA_FREE(d_dirty, stream));
return std::move(handles);
}// cudaSplitGridHandles

@@ -106,7 +106,7 @@ cudaMergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT*
auto buffer = BufferT::create(size, other, false, stream);
uint8_t *dst = buffer.deviceData();
bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream));
cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream));
for (auto &h : handles) {
const uint8_t *src = h.deviceData();
for (uint32_t n=0; n<h.gridCount(); ++n) {
@@ -120,7 +120,7 @@ cudaMergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT*
src += h.gridSize(n);
}
}
cudaCheck(cudaFreeAsync(d_dirty, stream));
cudaCheck(CUDA_FREE(d_dirty, stream));
return GridHandle<BufferT>(std::move(buffer));
}// cudaMergeGridHandles

4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaGridStats.cuh
@@ -210,7 +210,7 @@ void CudaGridStats<BuildT, StatsT>::operator()(NanoGrid<BuildT> *d_grid, cudaStr

StatsT *d_stats = nullptr;

if constexpr(StatsT::hasAverage()) cudaCheck(cudaMallocAsync((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream));
if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_MALLOC((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream));

processLeaf<BuildT><<<blocksPerGrid(nodeCount[0]), threadsPerBlock, 0, stream>>>(d_nodeMgr, d_stats);

@@ -220,7 +220,7 @@ void CudaGridStats<BuildT, StatsT>::operator()(NanoGrid<BuildT> *d_grid, cudaStr

processRootAndGrid<BuildT><<<1, 1, 0, stream>>>(d_nodeMgr, d_stats);

if constexpr(StatsT::hasAverage()) cudaCheck(cudaFreeAsync(d_stats, stream));
if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_FREE(d_stats, stream));

} // CudaGridStats::operator()( Grid )

8 changes: 4 additions & 4 deletions nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
@@ -64,7 +64,7 @@ public:
/// @param srcGrid Device pointer to IndexGrid used as the source
CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream = 0);

~CudaIndexToGrid() {cudaCheck(cudaFreeAsync(mDevNodeAcc, mStream));}
~CudaIndexToGrid() {cudaCheck(CUDA_FREE(mDevNodeAcc, mStream));}

/// @brief Toggle on and off verbose mode
/// @param on if true verbose is turned on
@@ -295,7 +295,7 @@ CudaIndexToGrid<SrcBuildT>::CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStrea
: mStream(stream), mTimer(stream)
{
NANOVDB_ASSERT(d_srcGrid);
cudaCheck(cudaMallocAsync((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream));
cudaCheck(CUDA_MALLOC((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream));
cudaCpyNodeCount<SrcBuildT><<<1, 1, 0, mStream>>>(d_srcGrid, mDevNodeAcc);
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&mNodeAcc, mDevNodeAcc, sizeof(NodeAccessor), cudaMemcpyDeviceToHost, mStream));// mNodeAcc = *mDevNodeAcc
@@ -319,7 +319,7 @@ GridHandle<BufferT> CudaIndexToGrid<SrcBuildT>::getHandle(const typename BuildTo
cudaProcessRootTiles<SrcBuildT,DstBuildT><<<mNodeAcc.nodeCount[3], 1, 0, mStream>>>(mDevNodeAcc, srcValues);
cudaCheckError();

cudaCheck(cudaFreeAsync(mNodeAcc.d_gridName, mStream));
cudaCheck(CUDA_FREE(mNodeAcc.d_gridName, mStream));

if (mVerbose) mTimer.restart("Process upper internal nodes");
cudaProcessInternalNodes<SrcBuildT,DstBuildT,2><<<mNodeAcc.nodeCount[2], dim3(32,32), 0, mStream>>>(mDevNodeAcc, srcValues);
@@ -362,7 +362,7 @@ inline BufferT CudaIndexToGrid<SrcBuildT>::getBuffer(const BufferT &pool)
if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device");

if (size_t size = mGridName.size()) {
cudaCheck(cudaMallocAsync((void**)&mNodeAcc.d_gridName, size, mStream));
cudaCheck(CUDA_MALLOC((void**)&mNodeAcc.d_gridName, size, mStream));
cudaCheck(cudaMemcpyAsync(mNodeAcc.d_gridName, mGridName.data(), size, cudaMemcpyHostToDevice, mStream));
} else {
mNodeAcc.d_gridName = nullptr;
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh
@@ -38,7 +38,7 @@ cudaCreateNodeManager(const NanoGrid<BuildT> *d_grid,
auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream);
auto *d_data = (NodeManagerData*)buffer.deviceData();
size_t size = 0u, *d_size;
cudaCheck(cudaMallocAsync((void**)&d_size, sizeof(size_t), stream));
cudaCheck(CUDA_MALLOC((void**)&d_size, sizeof(size_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS
*d_data = NodeManagerData{NANOVDB_MAGIC_NODE, 0u, (void*)d_grid, {0u,0u,0u}};
@@ -58,7 +58,7 @@ cudaCreateNodeManager(const NanoGrid<BuildT> *d_grid,
});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
cudaCheck(cudaFreeAsync(d_size, stream));
cudaCheck(CUDA_FREE(d_size, stream));
if (size > sizeof(NodeManagerData)) {
auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device
cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream));
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
@@ -265,7 +265,7 @@ public:
{
mData.map = map;
mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst});
cudaCheck(cudaMallocAsync((void**)&mDeviceData, sizeof(Data), mStream));
cudaCheck(CUDA_MALLOC((void**)&mDeviceData, sizeof(Data), mStream));
}

/// @brief Default constructor
@@ -276,7 +276,7 @@ public:
: CudaPointsToGrid(Map(scale, trans), stream) {}

/// @brief Destructor
~CudaPointsToGrid() {cudaCheck(cudaFreeAsync(mDeviceData, mStream));}
~CudaPointsToGrid() {cudaCheck(CUDA_FREE(mDeviceData, mStream));}

/// @brief Toggle on and off verbose mode
/// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh
@@ -153,11 +153,11 @@ void CudaSignedFloodFill<BuildT>::operator()(NanoGrid<BuildT> *d_grid)
static_assert(BuildTraits<BuildT>::is_float, "CudaSignedFloodFill only works on float grids");
NANOVDB_ASSERT(d_grid);
uint64_t count[4], *d_count = nullptr;
cudaCheck(cudaMallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream));
cudaCheck(CUDA_MALLOC((void**)&d_count, 4*sizeof(uint64_t), mStream));
cudaCpyNodeCount<BuildT><<<1, 1, 0, mStream>>>(d_grid, d_count);
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream));
cudaCheck(cudaFreeAsync(d_count, mStream));
cudaCheck(CUDA_FREE(d_count, mStream));

static const int threadsPerBlock = 128;
auto blocksPerGrid = [&](size_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;};
43 changes: 29 additions & 14 deletions nanovdb/nanovdb/util/cuda/CudaUtils.h
@@ -53,20 +53,35 @@
cudaCheck(cudaGetLastError()); \
}

#if CUDART_VERSION < 11020 // 11.2 introduced cudaMallocAsync and cudaFreeAsync

/// @brief Dummy implementation of cudaMallocAsync that calls cudaMalloc
/// @param d_ptr Device pointer to allocated device memory
/// @param size Number of bytes to allocate
/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored)
/// @return Cuda error code
inline cudaError_t cudaMallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);}

/// @brief Dummy implementation of cudaFreeAsync that calls cudaFree
/// @param d_ptr Device pointer that will be freed
/// @param dummy The stream establishing the stream ordering promise (ignored)
/// @return Cuda error code
inline cudaError_t cudaFreeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);}
// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2; for older CUDA
// versions we fall back to cudaMalloc and cudaFree. The fallback can also be forced
// by defining the NANOVDB_USE_SYNC_CUDA_MALLOC flag. This is useful when deploying
// NanoVDB code in virtualized environments that share the GPU between instances by
// slicing it up into vGPUs. In such environments GPU unified memory is usually
// disabled for security reasons, which means cudaMallocAsync cannot be used.
#if (CUDART_VERSION < 11020) || defined(NANOVDB_USE_SYNC_CUDA_MALLOC)

// Note: both variants expand to the raw CUDA call (an expression rather than a
// braced block), so call sites can wrap them in cudaCheck(), e.g.
// cudaCheck(CUDA_MALLOC(...)). In the synchronous fallback the stream argument
// is accepted but ignored.
#define CUDA_MALLOC(d_ptr, size, stream) cudaMalloc((d_ptr), (size))

#define CUDA_FREE(d_ptr, stream) cudaFree((d_ptr))

#else

#define CUDA_MALLOC(d_ptr, size, stream) cudaMallocAsync((d_ptr), (size), (stream))

#define CUDA_FREE(d_ptr, stream) cudaFreeAsync((d_ptr), (stream))

#endif
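
For reference, a short usage sketch (not part of the commit) of the new macros, assuming only the CudaUtils.h header above is on the include path; the same code compiles whether or not NANOVDB_USE_SYNC_CUDA_MALLOC is defined, only the macro expansion changes.

// Minimal sketch using the CUDA_MALLOC/CUDA_FREE macros added above.
#include <cstdint>
#include <cuda_runtime.h>
#include <nanovdb/util/cuda/CudaUtils.h> // include path assumed from this diff

void scratchExample(cudaStream_t stream)
{
    uint64_t* d_tmp = nullptr;

    // Expands to cudaMallocAsync(..., stream) on CUDA >= 11.2 without the flag,
    // and to a plain cudaMalloc otherwise (the stream argument is then ignored).
    cudaCheck(CUDA_MALLOC((void**)&d_tmp, 4 * sizeof(uint64_t), stream));

    // ... launch kernels that use d_tmp on this stream ...

    cudaCheck(CUDA_FREE(d_tmp, stream));
}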

