diff --git a/src/care/CMakeLists.txt b/src/care/CMakeLists.txt
index 6c441ae0..d15385f0 100644
--- a/src/care/CMakeLists.txt
+++ b/src/care/CMakeLists.txt
@@ -27,6 +27,7 @@ set(care_headers
    care_inst.h
    CHAICallback.h
    CHAIDataGetter.h
+   CudaUmpireResource.h
    GPUWatchpoint.h
    Debug.h
    DefaultMacros.h
diff --git a/src/care/CudaUmpireResource.h b/src/care/CudaUmpireResource.h
new file mode 100644
index 00000000..e01fd0f1
--- /dev/null
+++ b/src/care/CudaUmpireResource.h
@@ -0,0 +1,146 @@
+#ifndef CARE_CUDA_UMPIRE_RESOURCE_H
+#define CARE_CUDA_UMPIRE_RESOURCE_H
+
+#include "camp/defines.hpp"
+
+#ifdef CAMP_ENABLE_CUDA
+
+#include <cuda_runtime.h> // camp/resource/cuda.hpp is missing this include
+
+#include "camp/resource/cuda.hpp"
+
+#include "umpire/ResourceManager.hpp"
+#include "umpire/Allocator.hpp"
+
+#include <cstddef>
+
+namespace care {
+   // A camp Cuda resource whose memory operations are backed by Umpire
+   // allocators instead of raw cudaMalloc* calls.
+   class CudaUmpireResource : public camp::resources::Cuda {
+      public:
+         // Uses Umpire's default "DEVICE", "PINNED", and "UM" allocators.
+         CudaUmpireResource() :
+            m_resourceManager{&umpire::ResourceManager::getInstance()}
+         {
+            m_deviceAllocator = m_resourceManager->getAllocator("DEVICE");
+            m_pinnedAllocator = m_resourceManager->getAllocator("PINNED");
+            m_managedAllocator = m_resourceManager->getAllocator("UM");
+         }
+
+         // Uses caller-supplied allocators (e.g. pools) for each memory space.
+         CudaUmpireResource(const umpire::Allocator& deviceAllocator,
+                            const umpire::Allocator& pinnedAllocator,
+                            const umpire::Allocator& managedAllocator) :
+            m_resourceManager{&umpire::ResourceManager::getInstance()},
+            m_deviceAllocator{deviceAllocator},
+            m_pinnedAllocator{pinnedAllocator},
+            m_managedAllocator{managedAllocator}
+         {
+         }
+
+         // Memory
+         template <typename T>
+         T *allocate(size_t size, camp::resources::MemoryAccess ma = camp::resources::MemoryAccess::Device) {
+            T *ret = nullptr;
+
+            if (size > 0) {
+               auto d{camp::resources::device_guard(get_device())};
+
+               switch (ma) {
+                  case camp::resources::MemoryAccess::Unknown:
+                  case camp::resources::MemoryAccess::Device:
+                     ret = static_cast<T*>(m_deviceAllocator.allocate(sizeof(T) * size));
+                     break;
+                  case camp::resources::MemoryAccess::Pinned:
+                     // TODO: do a test here for whether managed is *actually* shared
+                     // so we can use the better performing memory
+                     ret = static_cast<T*>(m_pinnedAllocator.allocate(sizeof(T) * size));
+                     break;
+                  case camp::resources::MemoryAccess::Managed:
+                     ret = static_cast<T*>(m_managedAllocator.allocate(sizeof(T) * size));
+                     break;
+               }
+            }
+
+            return ret;
+         }
+
+         void *calloc(size_t size, camp::resources::MemoryAccess ma = camp::resources::MemoryAccess::Device) {
+            void *p = allocate<char>(size, ma);
+            this->memset(p, 0, size);
+            return p;
+         }
+
+         void deallocate(void *p, camp::resources::MemoryAccess ma = camp::resources::MemoryAccess::Unknown) {
+            auto d{camp::resources::device_guard(get_device())};
+
+            if (ma == camp::resources::MemoryAccess::Unknown) {
+               ma = get_access_type(p);
+            }
+
+            switch (ma) {
+               case camp::resources::MemoryAccess::Device:
+                  m_deviceAllocator.deallocate(p);
+                  break;
+               case camp::resources::MemoryAccess::Pinned:
+                  // TODO: do a test here for whether managed is *actually* shared
+                  // so we can use the better performing memory
+                  m_pinnedAllocator.deallocate(p);
+                  break;
+               case camp::resources::MemoryAccess::Managed:
+                  m_managedAllocator.deallocate(p);
+                  break;
+               case camp::resources::MemoryAccess::Unknown:
+                  ::camp::throw_re("Unknown memory access type, cannot free");
+            }
+         }
+
+         void memcpy(void *dst, const void *src, size_t size) {
+            if (size > 0) {
+               auto d{camp::resources::device_guard(get_device())};
+               camp::resources::Resource resource(*this);
+               m_resourceManager->copy(dst, const_cast<void *>(src), resource, size);
+            }
+         }
+
+         void memset(void *p, int val, size_t size)
+         {
+            if (size > 0) {
+               auto d{camp::resources::device_guard(get_device())};
+               camp::resources::Resource resource(*this);
+               m_resourceManager->memset(p, val, resource, size);
+            }
+         }
+
+      private:
+         // TODO: Make this a public or protected method in camp
+         camp::resources::MemoryAccess get_access_type(void *p) {
+            cudaPointerAttributes a;
+            cudaError_t status = cudaPointerGetAttributes(&a, p);
+            if (status == cudaSuccess) {
+               switch(a.type){
+                  case cudaMemoryTypeUnregistered:
+                     return camp::resources::MemoryAccess::Unknown;
+                  case cudaMemoryTypeHost:
+                     return camp::resources::MemoryAccess::Pinned;
+                  case cudaMemoryTypeDevice:
+                     return camp::resources::MemoryAccess::Device;
+                  case cudaMemoryTypeManaged:
+                     return camp::resources::MemoryAccess::Managed;
+               }
+            }
+            ::camp::throw_re("invalid pointer detected");
+            // This return statement exists because compilers do not determine the
+            // above unconditionally throws
+            // related: https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void-function-in-constexpr-if-fun
+            return camp::resources::MemoryAccess::Unknown;
+         }
+
+         umpire::ResourceManager* m_resourceManager;
+
+         umpire::Allocator m_deviceAllocator;
+         umpire::Allocator m_pinnedAllocator;
+         umpire::Allocator m_managedAllocator;
+   }; // class CudaUmpireResource
+} // namespace care
+
+#endif // CAMP_ENABLE_CUDA
+
+#endif // CARE_CUDA_UMPIRE_RESOURCE_H
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 44db52c4..90f98867 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -163,3 +163,18 @@ target_include_directories(Benchmarks
 
 blt_add_test( NAME Benchmarks
               COMMAND Benchmarks )
+
+if (ENABLE_CUDA)
+  blt_add_executable( NAME TestCudaUmpireResource
+                      SOURCES TestCudaUmpireResource.cpp
+                      DEPENDS_ON ${care_test_dependencies} )
+
+  target_include_directories(TestCudaUmpireResource
+                             PRIVATE ${PROJECT_SOURCE_DIR}/src)
+
+  target_include_directories(TestCudaUmpireResource
+                             PRIVATE ${PROJECT_BINARY_DIR}/include)
+
+  blt_add_test( NAME TestCudaUmpireResource
+                COMMAND TestCudaUmpireResource )
+endif ()
diff --git a/test/TestCudaUmpireResource.cpp b/test/TestCudaUmpireResource.cpp
new file mode 100644
index 00000000..93f9f33f
--- /dev/null
+++ b/test/TestCudaUmpireResource.cpp
@@ -0,0 +1,71 @@
+//////////////////////////////////////////////////////////////////////////////////////
+// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
+// See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////////////
+
+#include "care/config.h"
+
+#if defined(CARE_GPUCC)
+
+// other library headers
+#include "gtest/gtest.h"
+#include "umpire/strategy/QuickPool.hpp"
+
+// care headers
+#include "care/CudaUmpireResource.h"
+#include "care/detail/test_utils.h"
+
+GPU_TEST(CudaUmpireResource, gpu_initialization) {
+   init_care_for_testing();
+}
+
+GPU_TEST(CudaUmpireResource, DefaultConstructor)
+{
+   care::CudaUmpireResource resource;
+}
+
+GPU_TEST(CudaUmpireResource, AllocatorConstructor)
+{
+   auto& rm = umpire::ResourceManager::getInstance();
+
+   // Device allocator
+   auto deviceAllocator = rm.getAllocator("DEVICE_POOL"); // Initialized above
+   auto customDeviceAllocator =
+      rm.makeAllocator<umpire::strategy::QuickPool>("CUSTOM_DEVICE_POOL",
+                                                    deviceAllocator,
+                                                    64*1024*1024,
+                                                    16*1024*1024);
+
+   // Pinned allocator
+   auto pinnedAllocator = rm.getAllocator("PINNED_POOL"); // Initialized above
+   auto customPinnedAllocator =
+      rm.makeAllocator<umpire::strategy::QuickPool>("CUSTOM_PINNED_POOL",
+                                                    pinnedAllocator,
+                                                    8*1024*1024,
+                                                    2*1024*1024);
+
+   // Managed allocator
+   auto managedAllocator = rm.getAllocator("UM"); // Umpire default
+
+   // Make a unified memory pool to draw from (not done in init_care_for_testing())
+   auto managedPoolAllocator =
+      rm.makeAllocator<umpire::strategy::QuickPool>("UM_POOL",
+                                                    managedAllocator,
+                                                    128*1024*1024,
+                                                    8*1024*1024);
+
+   auto customManagedAllocator =
+      rm.makeAllocator<umpire::strategy::QuickPool>("CUSTOM_UM_POOL",
+                                                    managedPoolAllocator,
+                                                    8*1024*1024,
+                                                    2*1024*1024);
+
+   care::CudaUmpireResource resource(customDeviceAllocator,
+                                     customPinnedAllocator,
+                                     customManagedAllocator);
+}
+
+#endif // CARE_GPUCC
+
diff --git a/tpl/raja b/tpl/raja
index 3774f513..d7bf64de 160000
--- a/tpl/raja
+++ b/tpl/raja
@@ -1 +1 @@
-Subproject commit 3774f51339459bbbdb77055aa23f82919b6335b6
+Subproject commit d7bf64def3ad8313eda53798f3e2c1479ac97545