Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added forall_with_streams and updated BenchmarkForall.cpp #232

Open
wants to merge 19 commits into
base: develop
Choose a base branch
from
105 changes: 105 additions & 0 deletions benchmarks/BenchmarkRaja.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
//////////////////////////////////////////////////////////////////////////////////////
// Copyright 2020 Lawrence Livermore National Security, LLC and other CARE developers.
// See the top-level LICENSE file for details.
//
// SPDX-License-Identifier: BSD-3-Clause
//////////////////////////////////////////////////////////////////////////////////////

// CARE headers
#include "care/DefaultMacros.h"
#include "care/host_device_ptr.h"
#include "care/forall.h"
#include "care/policies.h"
#include "RAJA/RAJA.hpp"

// Other library headers
#include <benchmark/benchmark.h>
#include <omp.h>

// Std library headers
#include <climits>
#include <cmath>

#define size 1000000

namespace care{
adayton1 marked this conversation as resolved.
Show resolved Hide resolved

#if defined(CARE_GPUCC)
//each kernel has a separate stream
static void benchmark_gpu_loop_separate_streams(benchmark::State& state) {
int N = state.range(0);
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
RAJA::resources::Cuda res_arr[N];
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
RAJA::resources::Event event_arr[N];
care::host_device_ptr<int> arrays[16];
for(int i = 0; i < N; i++)
{
RAJA::resources::Cuda res;
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
res_arr[i] = res;
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
RAJA::resources::Event e = res.get_event();
event_arr[i] = e;
care::host_device_ptr<int> arr(size, "arr");
arrays[i] = arr;
}

//warmup kernel
RAJA::resources::Cuda warmup_res;
CARE_STREAMED_LOOP(warmup_res, i, 0 , size) {
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
arrays[0][i] = 0;
} CARE_STREAMED_LOOP_END

for (auto _ : state) {
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
//run num kernels
for(int j = 0; j < N; j++)
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
{
CARE_STREAMED_LOOP(res_arr[j], i, 0 , size) {
arrays[j][i] = i;
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
} CARE_STREAMED_LOOP_END
}
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
}
for(int i = 0; i < N; i++) {arrays[i].free();}
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
}

// Register the function as a benchmark
BENCHMARK(benchmark_gpu_loop_separate_streams)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);

//all kernels on one stream
static void benchmark_gpu_loop_single_stream(benchmark::State& state) {
int N = state.range(0);

RAJA::resources::Cuda res;

care::host_device_ptr<int> arrays[16];
for(int i = 0; i < N; i++)
{
care::host_device_ptr<int> arr(size, "arr");
arrays[i] = arr;
}

//warmup kernel
RAJA::resources::Cuda warmup_res;
CARE_STREAMED_LOOP(warmup_res, i, 0, size) {
arrays[0][i] = i;
} CARE_STREAMED_LOOP_END

for (auto _ : state) {
//run num kernels
for(int j = 0; j < N; j++)
{
CARE_STREAMED_LOOP(res, i, 0, size) {
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
arrays[j][i] = i;
} CARE_STREAMED_LOOP_END
res.wait();
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
}
}
for(int i = 0; i < N; i++) {arrays[i].free();}
}

// Register the function as a benchmark
BENCHMARK(benchmark_gpu_loop_single_stream)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(12)->Arg(16);

#endif

} //namespace care

// Run the benchmarks
BENCHMARK_MAIN();
13 changes: 13 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,19 @@ target_include_directories(BenchmarkForall
blt_add_benchmark(NAME BenchmarkForall
COMMAND BenchmarkForall)

blt_add_executable(NAME BenchmarkRaja
SOURCES BenchmarkRaja.cpp
DEPENDS_ON ${care_benchmark_depends})

target_include_directories(BenchmarkRaja
PRIVATE ${PROJECT_SOURCE_DIR}/src)

target_include_directories(BenchmarkRaja
PRIVATE ${PROJECT_BINARY_DIR}/include)

blt_add_benchmark(NAME BenchmarkRaja
COMMAND BenchmarkRaja)

blt_add_executable(NAME BenchmarkNumeric
SOURCES BenchmarkNumeric.cpp
DEPENDS_ON ${care_benchmark_depends})
Expand Down
17 changes: 17 additions & 0 deletions src/care/DefaultMacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,10 @@

#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)

#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK)

#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) CARE_CHECKED_OPENMP_FOR_LOOP_END(CHECK)

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
Expand Down Expand Up @@ -548,6 +552,15 @@
#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) }); \
CARE_NEST_END(CHECK) }}

#define CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, CHECK) { \
adayton1 marked this conversation as resolved.
Show resolved Hide resolved
if (END_INDEX > START_INDEX) { \
CARE_NEST_BEGIN(CHECK) \
care::forall_with_stream(care::gpu{}, RESOURCE, __FILE__, __LINE__, START_INDEX, END_INDEX, [=] CARE_DEVICE (const int INDEX) { \
CARE_SET_THREAD_ID(INDEX)

#define CARE_CHECKED_STREAMED_LOOP_END(CHECK) }); \
CARE_NEST_END(CHECK) }}

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a GPU RAJA loop of length one. If GPU is
Expand Down Expand Up @@ -753,6 +766,10 @@

#define CARE_PARALLEL_LOOP_END CARE_CHECKED_PARALLEL_LOOP_END(care_parallel_loop_check)

#define CARE_STREAMED_LOOP(RESOURCE, INDEX, START_INDEX, END_INDEX) CARE_CHECKED_STREAMED_LOOP_START(RESOURCE, INDEX, START_INDEX, END_INDEX, care_streamed_loop_check)

#define CARE_STREAMED_LOOP_END CARE_CHECKED_STREAMED_LOOP_END(care_streamed_loop_check)

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Macros that start and end a RAJA loop that uses at least one
Expand Down
43 changes: 43 additions & 0 deletions src/care/forall.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,49 @@ namespace care {
#endif
}

////////////////////////////////////////////////////////////////////////////////
///
/// @author Neela Kausik
///
/// @brief If GPU is available, execute on the device. Otherwise, execute on
/// the host. This specialization is needed for clang-query.
///
/// @arg[in] gpu Used to choose this overload of forall
/// @arg[in] res Resource provided for execution
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] start The starting index (inclusive)
/// @arg[in] end The ending index (exclusive)
/// @arg[in] body The loop body to execute at each index
///
////////////////////////////////////////////////////////////////////////////////

#if defined(CARE_GPUCC)
template <typename LB>
void forall_with_stream(gpu, RAJA::resources::Cuda res, const char * fileName, const int lineNumber,
const int start, const int end, LB&& body) {
#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = true;
#endif

#if CARE_ENABLE_GPU_SIMULATION_MODE
forall(gpu_simulation{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#elif defined(__CUDACC__)
forall(RAJA::cuda_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you need an overload of forall that takes a resource, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you mean?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the function you are calling: https://github.com/LLNL/CARE/pull/232/files#diff-1df40e04088de0f82501a0065752487396b8abeb4c3d30780e79119cc63789a7R74

But it does not accept a resource argument. I'm confused at how this is working.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought it was calling RAJA::forall, but will look into it further

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It shouldn't be calling RAJA::forall - there's no overload that takes the fileName and lineNumber.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, that's the main reason I dislike "using namespace..." statements - it's too easy to accidentally call the wrong function.

#elif defined(__HIPCC__)
forall(RAJA::hip_exec<CARE_CUDA_BLOCK_SIZE, CARE_CUDA_ASYNC>{},
res, RAJA::RangeSegment(start, end), std::forward<LB>(body));
#else
forall(RAJA::seq_exec{}, res, fileName, lineNumber, start, end, std::forward<LB>(body));
#endif

#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = false;
#endif
}
#endif

////////////////////////////////////////////////////////////////////////////////
///
/// @author Alan Dayton
Expand Down