added forall_with_streams and updated BenchmarkForall.cpp #232

Open · wants to merge 19 commits into develop
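Summary: this PR adds a resource-aware care::forall overload, a forall_with_stream entry point for GPU builds, and the CARE_STREAMED_LOOP / CARE_STREAMED_LOOP_END macros, plus a new BenchmarkRaja benchmark that times N kernels launched on separate streams against the same N kernels on a single stream. A minimal usage sketch follows (assumes a CUDA or HIP build of CARE; the names r0, r1, a, b, and n are illustrative, not part of this diff):

care::Resource r0, r1;                 // two independent streams
const int n = 1000;                    // illustrative problem size
care::host_device_ptr<int> a(n, "a");
care::host_device_ptr<int> b(n, "b");

// These two loops target different streams and may overlap on the device
CARE_STREAMED_LOOP(r0, i, 0, n) {
   a[i] = i;
} CARE_STREAMED_LOOP_END

CARE_STREAMED_LOOP(r1, i, 0, n) {
   b[i] = 2 * i;
} CARE_STREAMED_LOOP_END

care::gpuDeviceSynchronize(__FILE__, __LINE__);
a.free();
b.free();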
106 changes: 106 additions & 0 deletions benchmarks/BenchmarkRaja.cpp
@@ -0,0 +1,106 @@
//////////////////////////////////////////////////////////////////////////////////////
// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
// See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: BSD-3-Clause
//////////////////////////////////////////////////////////////////////////////////////

// CARE headers
#include "care/DefaultMacros.h"
#include "care/host_device_ptr.h"
#include "care/forall.h"
#include "care/policies.h"
#include "RAJA/RAJA.hpp"

// Other library headers
#include <benchmark/benchmark.h>
#include <omp.h>

// Std library headers
#include <climits>
#include <cmath>

// Number of elements in each benchmark array
#define ARRAY_SIZE 1000000

#if defined(CARE_GPUCC)
// Each kernel runs on a separate stream
static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
int N = state.range(0);
care::Resource res_arr[16]; // up to 16 streams (matches the Arg list below)
RAJA::resources::Event event_arr[16]; // recorded below but otherwise unused here
care::host_device_ptr<int> arrays[16];
for(int i = 0; i < N; i++)
{
res_arr[i] = care::Resource();
event_arr[i] = res_arr[i].get_event();
arrays[i] = care::host_device_ptr<int>(ARRAY_SIZE, "arr");
}

// Warmup kernel
CARE_GPU_LOOP(i, 0, ARRAY_SIZE) {
arrays[0][i] = 0;
} CARE_GPU_LOOP_END

care::gpuDeviceSynchronize(__FILE__, __LINE__);

for (auto _ : state) {
// Launch N kernels, one per OpenMP thread, each on its own stream
omp_set_num_threads(N);
#pragma omp parallel for
for(int j = 0; j < N; j++)
{
CARE_STREAMED_LOOP(res_arr[j], i, 0, ARRAY_SIZE) {
arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
} CARE_STREAMED_LOOP_END
}
care::gpuDeviceSynchronize(__FILE__, __LINE__);
}

for(int i = 0; i < N; i++){
arrays[i].free();
}
}

// Register the function as a benchmark
BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);

// All kernels run on a single (default) stream
static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
int N = state.range(0);

care::host_device_ptr<int> arrays[16];
for(int i = 0; i < N; i++)
{
arrays[i] = care::host_device_ptr<int>(ARRAY_SIZE, "arr");
}

// Warmup kernel
CARE_GPU_LOOP(i, 0, ARRAY_SIZE) {
arrays[0][i] = 0;
} CARE_GPU_LOOP_END

care::gpuDeviceSynchronize(__FILE__, __LINE__);

for (auto _ : state) {
// Launch N kernels back-to-back on the default stream
for(int j = 0; j < N; j++)
{
CARE_GPU_LOOP(i, 0, ARRAY_SIZE) {
arrays[j][i] = sqrtf(i) + cosf(j) * powf(i, j);
} CARE_GPU_LOOP_END
}
care::gpuDeviceSynchronize(__FILE__, __LINE__);
}

for(int i = 0; i < N; i++){
arrays[i].free();
}
}

// Register the function as a benchmark
BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);

#endif

// Run the benchmarks
BENCHMARK_MAIN();
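Note: with Google Benchmark, each Arg(k) registers a variant whose state.range(0) is k, so both benchmarks above are swept over 1, 2, 4, 8, 12, and 16 concurrent kernels; the maximum of 16 matches the fixed array sizes in each function.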
13 changes: 13 additions & 0 deletions benchmarks/CMakeLists.txt
@@ -36,6 +36,19 @@ target_include_directories(BenchmarkForall
blt_add_benchmark(NAME BenchmarkForall
COMMAND BenchmarkForall)

blt_add_executable(NAME BenchmarkRaja
SOURCES BenchmarkRaja.cpp
DEPENDS_ON ${care_benchmark_depends})

target_include_directories(BenchmarkRaja
PRIVATE ${PROJECT_SOURCE_DIR}/src)

target_include_directories(BenchmarkRaja
PRIVATE ${PROJECT_BINARY_DIR}/include)

blt_add_benchmark(NAME BenchmarkRaja
COMMAND BenchmarkRaja)

blt_add_executable(NAME BenchmarkNumeric
SOURCES BenchmarkNumeric.cpp
DEPENDS_ON ${care_benchmark_depends})
17 changes: 17 additions & 0 deletions src/care/DefaultMacros.h
@@ -261,6 +261,10 @@

#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)

#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)

#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -548,6 +552,15 @@
#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
CARE_NEST_END(CHECK) }}

#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) { \
if (END_INDEX > START_INDEX) { \
CARE_NEST_BEGIN(CHECK) \
care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
CARE_SET_THREAD_ID(INDEX)

#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) }); \
CARE_NEST_END(CHECK) }}

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
@@ -753,6 +766,10 @@

#define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)

#define CARE_STREAMED_LOOP(RESOURCE, INDEX, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, care_streamed_loop_check)

#define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a RAJA loop that uses at least one
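For orientation, here is roughly what the new macro pair expands to in a GPU build; this is a simplified sketch with the CARE_NEST_* and CARE_SET_THREAD_ID bookkeeping omitted, and res and n are illustrative names:

// CARE_STREAMED_LOOP(res, i, 0, n) { ... } CARE_STREAMED_LOOP_END
// is approximately:
if (n > 0) {
   care::forall_with_stream(care::gpu{}, res, __FILE__, __LINE__, 0, n,
      [=] CARE_DEVICE (const int i) {
         // ... loop body ...
      });
}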
93 changes: 92 additions & 1 deletion src/care/forall.h
@@ -30,22 +30,26 @@ namespace care {
#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
static bool s_reverseLoopOrder = false;
#endif

template <typename T>
struct ExecutionPolicyToSpace {
static constexpr const chai::ExecutionSpace value = chai::CPU;
};

#if defined(__CUDACC__)
typedef RAJA::resources::Cuda Resource;
template <>
struct ExecutionPolicyToSpace<RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>> {
static constexpr const chai::ExecutionSpace value = chai::GPU;
};
#elif defined (__HIPCC__)
typedef RAJA::resources::Hip Resource;
template <>
struct ExecutionPolicyToSpace<RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>> {
static constexpr const chai::ExecutionSpace value = chai::GPU;
};
#else
typedef RAJA::resources::Host Resource;
#endif

#if CARE_ENABLE_GPU_SIMULATION_MODE
@@ -97,6 +101,50 @@
}
}

////////////////////////////////////////////////////////////////////////////////
///
/// @author Peter Robinson, Alan Dayton
///
/// @brief Loops over the given indices and calls the loop body with each index.
/// This overload is CHAI and RAJA aware and sets the execution space accordingly.
///
/// @arg[in] policy Used to choose this overload of forall
/// @arg[in] res The RAJA resource (stream) on which to execute
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] start The starting index (inclusive)
/// @arg[in] end The ending index (exclusive)
/// @arg[in] body The loop body to execute at each index
///
////////////////////////////////////////////////////////////////////////////////
template <typename R, typename ExecutionPolicy, typename LB>
void forall(ExecutionPolicy /* policy */, R res, const char * fileName, const int lineNumber,
const int start, const int end, LB&& body) {
const int length = end - start;

if (length != 0) {
PluginData::setFileName(fileName);
PluginData::setLineNumber(lineNumber);


#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
RAJA::RangeStrideSegment rangeSegment =
s_reverseLoopOrder ?
RAJA::RangeStrideSegment(end - 1, start - 1, -1) :
RAJA::RangeStrideSegment(start, end, 1);
#else
RAJA::RangeSegment rangeSegment = RAJA::RangeSegment(start, end);
#endif

#if CARE_ENABLE_GPU_SIMULATION_MODE
RAJA::forall<RAJA::seq_exec>(res, rangeSegment, std::forward<LB>(body));
#else
RAJA::forall<ExecutionPolicy>(res, rangeSegment, std::forward<LB>(body));
#endif
}
}
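// Usage sketch (illustrative, CUDA build; n and data are hypothetical names,
// not part of this diff): this overload can be called directly with an
// explicit resource and execution policy, e.g.
//
//    care::Resource res;
//    care::forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
//                 res, __FILE__, __LINE__, 0, n,
//                 [=] CARE_DEVICE (const int i) { data[i] = i; });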


////////////////////////////////////////////////////////////////////////////////
///
/// @author Alan Dayton
@@ -189,6 +237,49 @@
#endif
}

////////////////////////////////////////////////////////////////////////////////
///
/// @author Neela Kausik
///
/// @brief If GPU is available, execute on the device using the stream
///        associated with the given resource. Otherwise, execute on the host.
///
/// @arg[in] gpu Used to choose this overload of forall
/// @arg[in] res Resource provided for execution
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] start The starting index (inclusive)
/// @arg[in] end The ending index (exclusive)
/// @arg[in] body The loop body to execute at each index
///
////////////////////////////////////////////////////////////////////////////////

#if defined(CARE_GPUCC)
template <typename LB>
void forall_with_stream(gpu, Resource res, const char * fileName, const int lineNumber,
const int start, const int end, LB&& body) {
#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = true;
#endif

#if CARE_ENABLE_GPU_SIMULATION_MODE
forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#elif defined(__CUDACC__)
forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
res, fileName, lineNumber, start, end, std::forward<LB>(body));
#elif defined(__HIPCC__)
forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
res, fileName, lineNumber, start, end, std::forward<LB>(body));
#else
forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#endif

#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = false;
#endif
}
#endif

////////////////////////////////////////////////////////////////////////////////
///
/// @author Alan Dayton
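Finally, a hedged sketch of how the events collected in BenchmarkRaja.cpp could pair with forall_with_stream: one stream waits on another's event instead of issuing a full device synchronize. This assumes the camp/RAJA resource API (get_event, wait_for) behaves as exercised above; a, b, and n are illustrative names:

care::Resource resA, resB;
const int n = 1000;
care::host_device_ptr<int> a(n, "a");
care::host_device_ptr<int> b(n, "b");

// Kernel A on resA's stream
care::forall_with_stream(care::gpu{}, resA, __FILE__, __LINE__, 0, n,
   [=] CARE_DEVICE (const int i) { a[i] = i; });

// Block resB's stream (not the host) until kernel A completes
RAJA::resources::Event eventA = resA.get_event();
resB.wait_for(&eventA);

// Kernel B on resB's stream, safely reading A's output
care::forall_with_stream(care::gpu{}, resB, __FILE__, __LINE__, 0, n,
   [=] CARE_DEVICE (const int i) { b[i] = a[i] + 1; });

care::gpuDeviceSynchronize(__FILE__, __LINE__);
a.free();
b.free();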