Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enabled sort benchmarks in mhp, fixed slow benchmarks in mhp #547

Merged
merged 2 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions benchmarks/gbench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
GIT_TAG v1.8.0)
FetchContent_MakeAvailable(googlebench)

if(ENABLE_CUDA)
# demote cuda-compat from an error to a warning, because otherwise sort.cpp
# compilation fails on this warning from
# dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h:
# "attribute argument 16 is invalid and will be ignored; CUDA requires
# sub_group size 32"
add_compile_options(-Wno-error=cuda-compat)
endif()

# mhp is not under ENABLE_SYCL so that benchmarks compilation is also checked in gcc
add_subdirectory(mhp)

Expand Down
15 changes: 11 additions & 4 deletions benchmarks/gbench/common/dr_bench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,18 @@ inline auto device_info(sycl::device device) {
#ifdef BENCH_MHP
#ifdef SYCL_LANGUAGE_VERSION

inline sycl::context *mhp_global_context_ = nullptr;
inline std::vector<sycl::device> devices;

inline sycl::queue get_queue() {
std::vector<sycl::device> devices;
if (mhp_global_context_ != nullptr) {
return sycl::queue(*mhp_global_context_, devices[0]);
}

auto root_devices = sycl::platform().get_devices();

for (auto &&root_device : root_devices) {
dr::drlog.debug("Root device: {}\n",
for (auto &&[idx, root_device] : rng::views::enumerate(root_devices)) {
dr::drlog.debug("Root device no {}: {}\n", idx,
root_device.get_info<sycl::info::device::name>());
if (dr::__detail::partitionable(root_device)) {
auto subdevices = root_device.create_sub_devices<
Expand All @@ -81,7 +86,9 @@ inline sycl::queue get_queue() {
}

assert(rng::size(devices) > 0);
return sycl::queue(devices[0]);

mhp_global_context_ = new sycl::context(devices);
return sycl::queue(*mhp_global_context_, devices[0]);
}

#endif
Expand Down
19 changes: 14 additions & 5 deletions benchmarks/gbench/common/sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ template <rng::forward_range X> void fill_random(X &&x) {

class DRSortFixture : public benchmark::Fixture {
protected:
dr::shp::distributed_vector<T> *a;
xhp::distributed_vector<T> *a;

public:
void SetUp(::benchmark::State &) {
a = new dr::shp::distributed_vector<T>(default_vector_size);
a = new xhp::distributed_vector<T>(default_vector_size);
fill_random(*a);
}

Expand All @@ -29,27 +29,34 @@ BENCHMARK_DEFINE_F(DRSortFixture, Sort_DR)(benchmark::State &state) {
Stats stats(state, sizeof(T) * a->size());
for (auto _ : state) {
state.PauseTiming();
dr::shp::distributed_vector<T> vec{*a};
xhp::distributed_vector<T> vec(a->size());
xhp::copy(*a, rng::begin(vec));
stats.rep();
state.ResumeTiming();

dr::shp::sort(vec);
// sort not implemented in mhp yet
#ifdef BENCH_SHP
xhp::sort(vec);
#endif
}
}

DR_BENCHMARK_REGISTER_F(DRSortFixture, Sort_DR);

#ifdef SYCL_LANGUAGE_VERSION
class SyclSortFixture : public benchmark::Fixture {
protected:
std::vector<T> local_vec;

sycl::queue queue;
oneapi::dpl::execution::device_policy<> policy;
T *vec;

public:
void SetUp(::benchmark::State &) {
dr::drlog.debug("setting up SyclSortFixture\n");
// when using mhp's get_queue() long execution is observed in this test
// (probably due to JIT), now shp and shp use their own get_queue-s
// (probably due to JIT), now mhp and shp use their own get_queue-s
queue = get_queue();
policy = oneapi::dpl::execution::make_device_policy(queue);
local_vec = std::vector<T>(default_vector_size);
Expand All @@ -58,6 +65,7 @@ class SyclSortFixture : public benchmark::Fixture {
}

void TearDown(::benchmark::State &state) {
dr::drlog.debug("tearing down SyclSortFixture\n");
// copy back to check if last sort really sorted
queue.memcpy(local_vec.data(), vec, default_vector_size * sizeof(T)).wait();
sycl::free(vec, queue);
Expand Down Expand Up @@ -103,6 +111,7 @@ BENCHMARK_DEFINE_F(SyclSortFixture, Sort_DPL)(benchmark::State &state) {
}

DR_BENCHMARK_REGISTER_F(SyclSortFixture, Sort_DPL);
#endif

class StdSortFixture : public benchmark::Fixture {
protected:
Expand Down
1 change: 1 addition & 0 deletions benchmarks/gbench/mhp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ add_executable(
../common/distributed_vector.cpp
../common/dot_product.cpp
../common/inclusive_scan.cpp
../common/sort.cpp
../common/stream.cpp
wave_equation.cpp
rooted.cpp
Expand Down