diff --git a/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh b/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
index c750412458..ad6a447764 100644
--- a/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
@@ -62,7 +62,7 @@ cudaAddBlindData(const NanoGrid *d_grid,
     // extract byte sizes of the grid, blind meta data and blind data
     enum {GRID=0, META=1, DATA=2, CHECKSUM=3};
     uint64_t tmp[4], *d_tmp;
-    cudaCheck(cudaMallocAsync((void**)&d_tmp, 4*sizeof(uint64_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_tmp, 4*sizeof(uint64_t), stream));
     cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
         if (auto count = d_grid->blindDataCount()) {
             d_tmp[GRID] = PtrDiff(&d_grid->blindMetaData(0), d_grid);
@@ -114,7 +114,7 @@ cudaAddBlindData(const NanoGrid *d_grid,
         for (uint32_t i=0, n=grid.mBlindMetadataCount-1; imDataOffset += sizeof(GridBlindMetaData);
         grid.mGridSize += sizeof(GridBlindMetaData) + meta->blindDataSize();// expansion with 32 byte alignment
     });
     cudaCheckError();
-    cudaCheck(cudaFreeAsync(d_tmp, stream));
+    cudaCheck(CUDA_FREE(d_tmp, stream));
     GridChecksum cs(tmp[CHECKSUM]);
     cudaGridChecksum(reinterpret_cast(d_data), cs.mode());
diff --git a/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h b/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
index 4b9820771d..3a3a450bd6 100644
--- a/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
+++ b/nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
@@ -153,7 +153,7 @@ inline void CudaDeviceBuffer::init(uint64_t size, bool host, void* stream)
         cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
         checkPtr(mCpuData, "CudaDeviceBuffer::init: failed to allocate host buffer");
     } else {
-        cudaCheck(cudaMallocAsync((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(CUDA_MALLOC((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
         checkPtr(mGpuData, "CudaDeviceBuffer::init: failed to allocate device buffer");
     }
     mSize = size;
@@ -163,7 +163,7 @@ inline void CudaDeviceBuffer::deviceUpload(void* stream, bool sync) const
 {
     checkPtr(mCpuData, "uninitialized cpu data");
     if (mGpuData == nullptr) {
-        cudaCheck(cudaMallocAsync((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
+        cudaCheck(CUDA_MALLOC((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
     }
     checkPtr(mGpuData, "uninitialized gpu data");
     cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream)));
@@ -183,7 +183,7 @@ inline void CudaDeviceBuffer::deviceDownload(void* stream, bool sync) const
 
 inline void CudaDeviceBuffer::clear(void *stream)
 {
-    if (mGpuData) cudaCheck(cudaFreeAsync(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
+    if (mGpuData) cudaCheck(CUDA_FREE(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
     if (mCpuData) cudaCheck(cudaFreeHost(mCpuData));
     mCpuData = mGpuData = nullptr;
     mSize = 0;
diff --git a/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh b/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
index e3ae9a941f..c1d6231636 100644
--- a/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
@@ -67,7 +67,7 @@ __global__ void checksumKernel(const T *d_data, uint32_t* d_blockCRC, uint32_t b
 inline uint32_t* cudaCreateLut(cudaStream_t stream = 0)
 {
     uint32_t *d_lut;
-    cudaCheck(cudaMallocAsync((void**)&d_lut, 256*sizeof(uint32_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_lut, 256*sizeof(uint32_t), stream));
     initLutKernel<<<1, 256, 0, stream>>>(d_lut);
     cudaCheckError();
     return d_lut;
@@ -93,10 +93,10 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
     uint8_t *d_begin = reinterpret_cast(d_gridData);
     uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32
     uint64_t size[2], *d_size;// {total size of grid, partial size for first checksum}
-    cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream));
 
     // Compute CRC32 checksum of GridData, TreeData, RootData (+tiles), but exclude GridData::mMagic and GridData::mChecksum
-    cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
+    cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__ (size_t) {
         d_size[0] = d_gridData->mGridSize;
         uint8_t *d_mid = d_gridData->template nodePtr<2>();
         if (d_mid == nullptr) {// no upper nodes
@@ -112,7 +112,7 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
     });
     cudaCheckError();
     cudaCheck(cudaMemcpyAsync(size, d_size, 2*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaFreeAsync(d_size, stream));
+    cudaCheck(CUDA_FREE(d_size, stream));
 
     if (mode != ChecksumMode::Full || size[0] == size[1]) return ChecksumMode::Partial;
 
@@ -120,7 +120,7 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
     const uint8_t *d_mid = d_begin + size[1], *d_end = d_begin + size[0];
     uint32_t *d_checksums;// 4096 byte chunks
     const uint64_t checksumCount = (d_end - d_mid) >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// 4 KB (4096 byte)
-    cudaCheck(cudaMallocAsync((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream));
     cudaLambdaKernel<<>>(checksumCount, [=] __device__(size_t tid) {
         uint32_t size = 1< *d_grid, ChecksumMode mode
 inline GridChecksum cudaGetGridChecksum(GridData *d_gridData, cudaStream_t stream = 0)
 {
     uint64_t checksum, *d_checksum;
-    cudaCheck(cudaMallocAsync((void**)&d_checksum, sizeof(uint64_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_checksum, sizeof(uint64_t), stream));
     cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {*d_checksum = d_gridData->mChecksum;});
     cudaCheckError();
     cudaCheck(cudaMemcpyAsync(&checksum, d_checksum, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaFreeAsync(d_checksum, stream));
+    cudaCheck(CUDA_FREE(d_checksum, stream));
     return GridChecksum(checksum);;
 }
 
@@ -172,7 +172,7 @@ void cudaGridChecksum(NanoGrid *d_grid, ChecksumMode mode = ChecksumMode
 
     uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32
     uint64_t size[2], *d_size;
-    cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream));
     cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
         d_size[0] = d_grid->gridSize();
         d_size[1] = d_grid->memUsage() + d_grid->tree().memUsage() + d_grid->tree().root().memUsage();
@@ -188,15 +188,15 @@ void cudaGridChecksum(NanoGrid *d_grid, ChecksumMode mode = ChecksumMode
 
     // Get node counts
     uint32_t nodeCount[3], *d_nodeCount, *d_checksums, *d_ptr;
-    cudaCheck(cudaMallocAsync((void**)&d_nodeCount, 3*sizeof(uint32_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_nodeCount, 3*sizeof(uint32_t), stream));
     cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
         auto &tree = d_grid->tree();
         for (int i = 0; i < 3; ++i) d_nodeCount[i] = tree.nodeCount(i);
     });
     cudaCheckError();
     cudaCheck(cudaMemcpyAsync(nodeCount, d_nodeCount, 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaFreeAsync(d_nodeCount, stream));
-    cudaCheck(cudaMallocAsync((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream));
+    cudaCheck(CUDA_FREE(d_nodeCount, stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream));
 
     auto nodeMgrHandle = cudaCreateNodeManager(d_grid, CudaDeviceBuffer(), stream);
     auto *d_nodeMgr = nodeMgrHandle.template deviceMgr();
@@ -232,9 +232,9 @@ void cudaGridChecksum(NanoGrid *d_grid, ChecksumMode mode = ChecksumMode
     });
     cudaCheckError();
 
-    cudaCheck(cudaFreeAsync(d_size, stream));
-    cudaCheck(cudaFreeAsync(d_checksums, stream));
-    cudaCheck(cudaFreeAsync(d_lut, stream));
+    cudaCheck(CUDA_FREE(d_size, stream));
+    cudaCheck(CUDA_FREE(d_checksums, stream));
+    cudaCheck(CUDA_FREE(d_lut, stream));
 }// cudaGridChecksum
 
 #endif
diff --git a/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh b/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh
index 5446c56231..dd547fb1d4 100644
--- a/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh
@@ -71,7 +71,7 @@ cudaSplitGridHandles(const GridHandle &handle, const BufferT* other = n
     if (ptr == nullptr) return VectorT>();
     VectorT> handles(handle.gridCount());
     bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
-    cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream));
     for (uint32_t n=0; n(buffer.deviceData());
@@ -84,7 +84,7 @@ cudaSplitGridHandles(const GridHandle &handle, const BufferT* other = n
         handles[n] = GridHandle(std::move(buffer));
         ptr += handle.gridSize(n);
     }
-    cudaCheck(cudaFreeAsync(d_dirty, stream));
+    cudaCheck(CUDA_FREE(d_dirty, stream));
     return std::move(handles);
 }// cudaSplitGridHandles
 
@@ -106,7 +106,7 @@ cudaMergeGridHandles(const VectorT> &handles, const BufferT*
     auto buffer = BufferT::create(size, other, false, stream);
     uint8_t *dst = buffer.deviceData();
     bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
-    cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream));
     for (auto &h : handles) {
         const uint8_t *src = h.deviceData();
         for (uint32_t n=0; n> &handles, const BufferT*
             src += h.gridSize(n);
         }
     }
-    cudaCheck(cudaFreeAsync(d_dirty, stream));
+    cudaCheck(CUDA_FREE(d_dirty, stream));
     return GridHandle(std::move(buffer));
 }// cudaMergeGridHandles
diff --git a/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh b/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh
index dcf5bfc850..0c4b970537 100644
--- a/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaGridStats.cuh
@@ -210,7 +210,7 @@ void CudaGridStats::operator()(NanoGrid *d_grid, cudaStr
 
     StatsT *d_stats = nullptr;
 
-    if constexpr(StatsT::hasAverage()) cudaCheck(cudaMallocAsync((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream));
+    if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_MALLOC((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream));
 
     processLeaf<<>>(d_nodeMgr, d_stats);
 
@@ -220,7 +220,7 @@ void CudaGridStats::operator()(NanoGrid *d_grid, cudaStr
 
     processRootAndGrid<<<1, 1, 0, stream>>>(d_nodeMgr, d_stats);
 
-    if constexpr(StatsT::hasAverage()) cudaCheck(cudaFreeAsync(d_stats, stream));
+    if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_FREE(d_stats, stream));
 
 } // CudaGridStats::operator()( Grid )
diff --git a/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh b/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
index 8394ecefe1..2374ba2f71 100644
--- a/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
@@ -64,7 +64,7 @@ public:
     /// @param srcGrid Device pointer to IndexGrid used as the source
    CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream = 0);
 
-    ~CudaIndexToGrid() {cudaCheck(cudaFreeAsync(mDevNodeAcc, mStream));}
+    ~CudaIndexToGrid() {cudaCheck(CUDA_FREE(mDevNodeAcc, mStream));}
 
     /// @brief Toggle on and off verbose mode
     /// @param on if true verbose is turned on
@@ -295,7 +295,7 @@ CudaIndexToGrid::CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStrea
     : mStream(stream), mTimer(stream)
 {
     NANOVDB_ASSERT(d_srcGrid);
-    cudaCheck(cudaMallocAsync((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream));
+    cudaCheck(CUDA_MALLOC((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream));
     cudaCpyNodeCount<<<1, 1, 0, mStream>>>(d_srcGrid, mDevNodeAcc);
     cudaCheckError();
     cudaCheck(cudaMemcpyAsync(&mNodeAcc, mDevNodeAcc, sizeof(NodeAccessor), cudaMemcpyDeviceToHost, mStream));// mNodeAcc = *mDevNodeAcc
@@ -319,7 +319,7 @@ GridHandle CudaIndexToGrid::getHandle(const typename BuildTo
     cudaProcessRootTiles<<>>(mDevNodeAcc, srcValues);
     cudaCheckError();
 
-    cudaCheck(cudaFreeAsync(mNodeAcc.d_gridName, mStream));
+    cudaCheck(CUDA_FREE(mNodeAcc.d_gridName, mStream));
 
     if (mVerbose) mTimer.restart("Process upper internal nodes");
     cudaProcessInternalNodes<<>>(mDevNodeAcc, srcValues);
@@ -362,7 +362,7 @@ inline BufferT CudaIndexToGrid::getBuffer(const BufferT &pool)
     if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device");
 
     if (size_t size = mGridName.size()) {
-        cudaCheck(cudaMallocAsync((void**)&mNodeAcc.d_gridName, size, mStream));
+        cudaCheck(CUDA_MALLOC((void**)&mNodeAcc.d_gridName, size, mStream));
         cudaCheck(cudaMemcpyAsync(mNodeAcc.d_gridName, mGridName.data(), size, cudaMemcpyHostToDevice, mStream));
     } else {
         mNodeAcc.d_gridName = nullptr;
diff --git a/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh b/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh
index 3d35a4b902..7ffb1db948 100644
--- a/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh
@@ -38,7 +38,7 @@ cudaCreateNodeManager(const NanoGrid *d_grid,
     auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream);
     auto *d_data = (NodeManagerData*)buffer.deviceData();
     size_t size = 0u, *d_size;
-    cudaCheck(cudaMallocAsync((void**)&d_size, sizeof(size_t), stream));
+    cudaCheck(CUDA_MALLOC((void**)&d_size, sizeof(size_t), stream));
     cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
 #ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS
         *d_data = NodeManagerData{NANOVDB_MAGIC_NODE, 0u, (void*)d_grid, {0u,0u,0u}};
@@ -58,7 +58,7 @@ cudaCreateNodeManager(const NanoGrid *d_grid,
     });
     cudaCheckError();
     cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaFreeAsync(d_size, stream));
+    cudaCheck(CUDA_FREE(d_size, stream));
     if (size > sizeof(NodeManagerData)) {
         auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device
         cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream));
diff --git a/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh b/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
index 733dc35cb9..d926d1a344 100644
--- a/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
@@ -265,7 +265,7 @@ public:
     {
         mData.map = map;
         mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst});
-        cudaCheck(cudaMallocAsync((void**)&mDeviceData, sizeof(Data), mStream));
+        cudaCheck(CUDA_MALLOC((void**)&mDeviceData, sizeof(Data), mStream));
     }
 
     /// @brief Default constructor
@@ -276,7 +276,7 @@ public:
         : CudaPointsToGrid(Map(scale, trans), stream) {}
 
     /// @brief Destructor
-    ~CudaPointsToGrid() {cudaCheck(cudaFreeAsync(mDeviceData, mStream));}
+    ~CudaPointsToGrid() {cudaCheck(CUDA_FREE(mDeviceData, mStream));}
 
     /// @brief Toggle on and off verbose mode
     /// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking
diff --git a/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh b/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh
index 2f4bf203d6..eb40a35aae 100644
--- a/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh
+++ b/nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh
@@ -153,11 +153,11 @@ void CudaSignedFloodFill::operator()(NanoGrid *d_grid)
     static_assert(BuildTraits::is_float, "CudaSignedFloodFill only works on float grids");
     NANOVDB_ASSERT(d_grid);
     uint64_t count[4], *d_count = nullptr;
-    cudaCheck(cudaMallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream));
+    cudaCheck(CUDA_MALLOC((void**)&d_count, 4*sizeof(uint64_t), mStream));
     cudaCpyNodeCount<<<1, 1, 0, mStream>>>(d_grid, d_count);
     cudaCheckError();
     cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream));
-    cudaCheck(cudaFreeAsync(d_count, mStream));
+    cudaCheck(CUDA_FREE(d_count, mStream));
 
     static const int threadsPerBlock = 128;
     auto blocksPerGrid = [&](size_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;};
diff --git a/nanovdb/nanovdb/util/cuda/CudaUtils.h b/nanovdb/nanovdb/util/cuda/CudaUtils.h
index 40001748ee..cfac74cb99 100644
--- a/nanovdb/nanovdb/util/cuda/CudaUtils.h
+++ b/nanovdb/nanovdb/util/cuda/CudaUtils.h
@@ -53,20 +53,21 @@
     cudaCheck(cudaGetLastError()); \
     }
 
-#if CUDART_VERSION < 11020 // 11.2 introduced cudaMallocAsync and cudaFreeAsync
-
-/// @brief Dummy implementation of cudaMallocAsync that calls cudaMalloc
-/// @param d_ptr Device pointer to allocated device memory
-/// @param size Number of bytes to allocate
-/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored)
-/// @return Cuda error code
-inline cudaError_t cudaMallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);}
-
-/// @brief Dummy implementation of cudaFreeAsync that calls cudaFree
-/// @param d_ptr Device pointer that will be freed
-/// @param dummy The stream establishing the stream ordering promise (ignored)
-/// @return Cuda error code
-inline cudaError_t cudaFreeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);}
+// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2, so for older CUDA
+// versions we fall back to cudaMalloc and cudaFree. The fallback can also be forced
+// with the NANOVDB_USE_SYNC_CUDA_MALLOC flag. This may be useful when deploying NanoVDB
+// code in virtualized environments that share the GPU between instances by slicing
+// it up into vGPUs. In such environments GPU unified memory is usually disabled
+// out of security considerations, which means cudaMallocAsync cannot be used.
+#if (CUDART_VERSION < 11020) || defined(NANOVDB_USE_SYNC_CUDA_MALLOC)
+
+#define CUDA_MALLOC(d_ptr, size, stream) cudaMalloc((d_ptr), (size))
+#define CUDA_FREE(d_ptr, stream) cudaFree((d_ptr))
+
+#else
+
+#define CUDA_MALLOC(d_ptr, size, stream) cudaMallocAsync((d_ptr), (size), (stream))
+#define CUDA_FREE(d_ptr, stream) cudaFreeAsync((d_ptr), (stream))
 
 #endif
 
@@ -133,4 +134,4 @@ __device__ inline bool cudaStrEq(const char *lhs, const char *rhs)
 
 #endif// __CUDACC__
 
-#endif// NANOVDB_CUDA_UTILS_H_HAS_BEEN_INCLUDED
\ No newline at end of file
+#endif// NANOVDB_CUDA_UTILS_H_HAS_BEEN_INCLUDED
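For reference, a minimal usage sketch (not part of the patch) of the new macros at a call site like the ones updated above. CUDA_MALLOC and CUDA_FREE expand to a single cudaError_t expression, so they can be wrapped in cudaCheck exactly like the cudaMallocAsync/cudaFreeAsync calls they replace; building with -DNANOVDB_USE_SYNC_CUDA_MALLOC (or against CUDA older than 11.2) switches the expansion to the synchronous cudaMalloc/cudaFree fallback. The function name and buffer size below are purely illustrative.

    // Illustrative sketch only; assumes CudaUtils.h provides CUDA_MALLOC, CUDA_FREE and cudaCheck.
    #include <cstdint>
    #include <nanovdb/util/cuda/CudaUtils.h>

    void scratchExample(cudaStream_t stream)
    {
        uint64_t *d_tmp = nullptr;
        // expands to cudaMallocAsync, or to cudaMalloc when the fallback is active
        cudaCheck(CUDA_MALLOC((void**)&d_tmp, 4*sizeof(uint64_t), stream));
        // ... launch kernels on 'stream' that read/write d_tmp ...
        // expands to cudaFreeAsync, or to cudaFree when the fallback is active
        cudaCheck(CUDA_FREE(d_tmp, stream));
    }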