Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add a resource with custom allocator #230

Draft
wants to merge 8 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/care/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ set(care_headers
care_inst.h
CHAICallback.h
CHAIDataGetter.h
CudaUmpireResource.h
GPUWatchpoint.h
Debug.h
DefaultMacros.h
Expand Down
146 changes: 146 additions & 0 deletions src/care/CudaUmpireResource.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#ifndef CARE_CUDA_UMPIRE_RESOURCE_H
#define CARE_CUDA_UMPIRE_RESOURCE_H

#include "camp/defines.hpp"

#ifdef CAMP_ENABLE_CUDA

#include <mutex> // camp/resource/cuda.hpp is missing this include

#include "camp/resource/cuda.hpp"

#include "umpire/ResourceManager.hpp"
#include "umpire/Allocator.hpp"

#include <cuda_runtime.h>

namespace care {
// A camp CUDA resource whose memory operations are routed through Umpire
// allocators instead of the raw CUDA runtime. Each memory-access kind
// (device / pinned / managed) is served by its own umpire::Allocator, so a
// caller can supply per-thread or per-pool allocators via the second
// constructor. Copies and memsets are dispatched through the Umpire
// ResourceManager using this resource (i.e. this object's CUDA stream).
class CudaUmpireResource : public camp::resources::Cuda {
public:
// Default-construct using Umpire's built-in "DEVICE", "PINNED", and "UM"
// allocators obtained from the singleton ResourceManager.
CudaUmpireResource() :
m_resourceManager{&umpire::ResourceManager::getInstance()}
{
m_deviceAllocator = m_resourceManager->getAllocator("DEVICE");
m_pinnedAllocator = m_resourceManager->getAllocator("PINNED");
m_managedAllocator = m_resourceManager->getAllocator("UM");
}

// Construct with caller-supplied allocators for each memory-access kind.
// The allocators are copied (umpire::Allocator is a lightweight handle).
CudaUmpireResource(const umpire::Allocator& deviceAllocator,
const umpire::Allocator& pinnedAllocator,
const umpire::Allocator& managedAllocator) :
m_resourceManager{&umpire::ResourceManager::getInstance()},
m_deviceAllocator{deviceAllocator},
m_pinnedAllocator{pinnedAllocator},
m_managedAllocator{managedAllocator}
{
}

// Memory
// Allocate space for `size` objects of type T from the allocator matching
// `ma`. Returns nullptr when size == 0. Note: MemoryAccess::Unknown is
// treated the same as Device (deliberate fallthrough in the switch).
// The device_guard pins the current CUDA device for the allocation call.
template <typename T>
T *allocate(size_t size, camp::resources::MemoryAccess ma = camp::resources::MemoryAccess::Device) {
T *ret = nullptr;

if (size > 0) {
auto d{camp::resources::device_guard(get_device())};

switch (ma) {
case camp::resources::MemoryAccess::Unknown:
case camp::resources::MemoryAccess::Device:
ret = static_cast<T*>(m_deviceAllocator.allocate(sizeof(T) * size));
break;
case camp::resources::MemoryAccess::Pinned:
// TODO: do a test here for whether managed is *actually* shared
// so we can use the better performing memory
ret = static_cast<T*>(m_pinnedAllocator.allocate(sizeof(T) * size));
break;
case camp::resources::MemoryAccess::Managed:
ret = static_cast<T*>(m_managedAllocator.allocate(sizeof(T) * size));
break;
}
}

return ret;
}

// Allocate `size` bytes and zero-fill them via this resource's memset
// (which is asynchronous with respect to the host on this stream).
void *calloc(size_t size, camp::resources::MemoryAccess ma = camp::resources::MemoryAccess::Device) {
void *p = allocate<char>(size, ma);
this->memset(p, 0, size);
return p;
}

// Return `p` to the allocator matching `ma`. When ma is Unknown the
// actual access type is queried from the CUDA runtime via
// get_access_type(); if it still cannot be determined, this throws.
void deallocate(void *p, camp::resources::MemoryAccess ma = camp::resources::MemoryAccess::Unknown) {
auto d{camp::resources::device_guard(get_device())};

if (ma == camp::resources::MemoryAccess::Unknown) {
ma = get_access_type(p);
}

switch (ma) {
case camp::resources::MemoryAccess::Device:
m_deviceAllocator.deallocate(p);
break;
case camp::resources::MemoryAccess::Pinned:
// TODO: do a test here for whether managed is *actually* shared
// so we can use the better performing memory
m_pinnedAllocator.deallocate(p);
break;
case camp::resources::MemoryAccess::Managed:
m_managedAllocator.deallocate(p);
break;
case camp::resources::MemoryAccess::Unknown:
::camp::throw_re("Unknown memory access type, cannot free");
}
}

// Copy `size` bytes from src to dst through the Umpire ResourceManager,
// associated with this resource (so it is ordered on this CUDA stream).
// No-op when size == 0. const_cast is required because Umpire's copy
// takes a non-const source pointer.
void memcpy(void *dst, const void *src, size_t size) {
if (size > 0) {
auto d{camp::resources::device_guard(get_device())};
camp::resources::Resource resource(*this);
m_resourceManager->copy(dst, const_cast<void*>(src), resource, size);
}
}

// Set `size` bytes at p to `val` through the Umpire ResourceManager,
// associated with this resource. No-op when size == 0.
void memset(void *p, int val, size_t size)
{
if (size > 0) {
auto d{camp::resources::device_guard(get_device())};
camp::resources::Resource resource(*this);
m_resourceManager->memset(p, val, resource, size);
}
}

private:
// TODO: Make this a public or protected method in camp
// Map a pointer's CUDA memory type (from cudaPointerGetAttributes) to the
// corresponding camp MemoryAccess. Unregistered host memory maps to
// Unknown; throws if the CUDA runtime query itself fails.
camp::resources::MemoryAccess get_access_type(void *p) {
cudaPointerAttributes a;
cudaError_t status = cudaPointerGetAttributes(&a, p);
if (status == cudaSuccess) {
switch(a.type){
case cudaMemoryTypeUnregistered:
return camp::resources::MemoryAccess::Unknown;
case cudaMemoryTypeHost:
return camp::resources::MemoryAccess::Pinned;
case cudaMemoryTypeDevice:
return camp::resources::MemoryAccess::Device;
case cudaMemoryTypeManaged:
return camp::resources::MemoryAccess::Managed;
}
}
::camp::throw_re("invalid pointer detected");
// This return statement exists because compilers do not determine the
// above unconditionally throws
// related: https://stackoverflow.com/questions/64523302/cuda-missing-return-statement-at-end-of-non-void-function-in-constexpr-if-fun
return camp::resources::MemoryAccess::Unknown;
}

// Non-owning pointer to the Umpire singleton; never null after construction.
umpire::ResourceManager* m_resourceManager;

// Allocator handles for each memory-access kind served by this resource.
umpire::Allocator m_deviceAllocator;
umpire::Allocator m_pinnedAllocator;
umpire::Allocator m_managedAllocator;
}; // class CudaUmpireResource
} // namespace care

#endif // CAMP_ENABLE_CUDA

#endif // CARE_CUDA_UMPIRE_RESOURCE_H
15 changes: 15 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,18 @@ target_include_directories(Benchmarks

blt_add_test( NAME Benchmarks
COMMAND Benchmarks )

# CudaUmpireResource is CUDA-only, so only build/register its test when
# CUDA support is enabled.
if (ENABLE_CUDA)
blt_add_executable( NAME TestCudaUmpireResource
SOURCES TestCudaUmpireResource.cpp
DEPENDS_ON ${care_test_dependencies} )

# Headers from the source tree (care/CudaUmpireResource.h).
target_include_directories(TestCudaUmpireResource
PRIVATE ${PROJECT_SOURCE_DIR}/src)

# Generated headers (care/config.h).
target_include_directories(TestCudaUmpireResource
PRIVATE ${PROJECT_BINARY_DIR}/include)

blt_add_test( NAME TestCudaUmpireResource
COMMAND TestCudaUmpireResource )
endif ()
71 changes: 71 additions & 0 deletions test/TestCudaUmpireResource.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
//////////////////////////////////////////////////////////////////////////////////////
// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
// See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: BSD-3-Clause
//////////////////////////////////////////////////////////////////////////////////////

#include "care/config.h"

#if defined(CARE_GPUCC)

// other library headers
#include "gtest/gtest.h"
#include "umpire/strategy/QuickPool.hpp"

// care headers
#include "care/CudaUmpireResource.h"
#include "care/detail/test_utils.h"

// One-time GPU/pool setup for this test binary; must run before the other
// tests since they rely on the allocators it registers (e.g. "DEVICE_POOL",
// "PINNED_POOL").
GPU_TEST(CudaUmpireResource, gpu_initialization) {
init_care_for_testing();
}

// Smoke test: default construction must succeed, pulling Umpire's built-in
// "DEVICE"/"PINNED"/"UM" allocators without throwing.
GPU_TEST(CudaUmpireResource, DefaultConstructor)
{
care::CudaUmpireResource resource;
}

// Smoke test for the allocator-taking constructor: build custom QuickPool
// allocators on top of the pools created by init_care_for_testing()
// ("DEVICE_POOL", "PINNED_POOL") plus a managed-memory pool created here,
// then construct a resource from them. Passes if nothing throws.
GPU_TEST(CudaUmpireResource, AllocatorConstructor)
{
   auto& rm = umpire::ResourceManager::getInstance();

   // Device allocator: a child pool drawn from the GPU-wide device pool.
   auto deviceAllocator = rm.getAllocator("DEVICE_POOL"); // Initialized above
   auto customDeviceAllocator =
      rm.makeAllocator<umpire::strategy::QuickPool>("CUSTOM_DEVICE_POOL",
                                                    deviceAllocator,
                                                    64*1024*1024,
                                                    16*1024*1024);

   // Pinned allocator: a child pool drawn from the GPU-wide pinned pool.
   auto pinnedAllocator = rm.getAllocator("PINNED_POOL"); // Initialized above
   auto customPinnedAllocator =
      rm.makeAllocator<umpire::strategy::QuickPool>("CUSTOM_PINNED_POOL",
                                                    pinnedAllocator,
                                                    8*1024*1024,
                                                    2*1024*1024);

   // Managed allocator
   auto managedAllocator = rm.getAllocator("UM"); // Umpire default

   // Make a unified memory pool to draw from (not done in init_care_for_testing())
   auto managedPoolAllocator =
      rm.makeAllocator<umpire::strategy::QuickPool>("UM_POOL",
                                                    managedAllocator,
                                                    128*1024*1024,
                                                    8*1024*1024);

   auto customManagedAllocator =
      rm.makeAllocator<umpire::strategy::QuickPool>("CUSTOM_UM_POOL",
                                                    managedPoolAllocator,
                                                    8*1024*1024,
                                                    2*1024*1024);

   // Exercise the three-allocator constructor.
   care::CudaUmpireResource resource(customDeviceAllocator,
                                     customPinnedAllocator,
                                     customManagedAllocator);
}

#endif // CARE_GPUCC