oneapi-src · lslusarczyk · Sep 8, 2023 · Sep 7, 2023 · Sep 7, 2023
diff --git a/benchmarks/gbench/CMakeLists.txt b/benchmarks/gbench/CMakeLists.txt
@@ -14,6 +14,14 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
     GIT_TAG v1.8.0)
   FetchContent_MakeAvailable(googlebench)
 
+  if(ENABLE_CUDA)
+    # Without this flag sort.cpp fails to build when warnings are errors:
+    # dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h warns
+    # "attribute argument 16 is invalid and will be ignored; CUDA requires
+    # sub_group size 32", so keep cuda-compat a warning instead of an error.
+    add_compile_options(-Wno-error=cuda-compat)
+  endif()
+
   # mhp is not under ENABLE_SYCL to check bechmarks also compilation in gcc
   add_subdirectory(mhp)
 

diff --git a/benchmarks/gbench/common/dr_bench.hpp b/benchmarks/gbench/common/dr_bench.hpp
@@ -54,13 +54,18 @@ inline auto device_info(sycl::device device) {
 #ifdef BENCH_MHP
 #ifdef SYCL_LANGUAGE_VERSION
 
+inline sycl::context *mhp_global_context_ = nullptr;
+inline std::vector<sycl::device> devices;
+
 inline sycl::queue get_queue() {
-  std::vector<sycl::device> devices;
+  if (mhp_global_context_ != nullptr) {
+    return sycl::queue(*mhp_global_context_, devices[0]);
+  }
 
   auto root_devices = sycl::platform().get_devices();
 
-  for (auto &&root_device : root_devices) {
-    dr::drlog.debug("Root device: {}\n",
+  for (auto &&[idx, root_device] : rng::views::enumerate(root_devices)) {
+    dr::drlog.debug("Root device no {}: {}\n", idx,
                     root_device.get_info<sycl::info::device::name>());
     if (dr::__detail::partitionable(root_device)) {
       auto subdevices = root_device.create_sub_devices<
@@ -81,7 +86,9 @@ inline sycl::queue get_queue() {
   }
 
   assert(rng::size(devices) > 0);
-  return sycl::queue(devices[0]);
+
+  mhp_global_context_ = new sycl::context(devices);
+  return sycl::queue(*mhp_global_context_, devices[0]);
 }
 
 #endif

diff --git a/benchmarks/gbench/common/sort.cpp b/benchmarks/gbench/common/sort.cpp
@@ -14,11 +14,11 @@ template <rng::forward_range X> void fill_random(X &&x) {
 
 class DRSortFixture : public benchmark::Fixture {
 protected:
-  dr::shp::distributed_vector<T> *a;
+  xhp::distributed_vector<T> *a;
 
 public:
   void SetUp(::benchmark::State &) {
-    a = new dr::shp::distributed_vector<T>(default_vector_size);
+    a = new xhp::distributed_vector<T>(default_vector_size);
     fill_random(*a);
   }
 
@@ -29,27 +29,34 @@ BENCHMARK_DEFINE_F(DRSortFixture, Sort_DR)(benchmark::State &state) {
   Stats stats(state, sizeof(T) * a->size());
   for (auto _ : state) {
     state.PauseTiming();
-    dr::shp::distributed_vector<T> vec{*a};
+    xhp::distributed_vector<T> vec(a->size());
+    xhp::copy(*a, rng::begin(vec));
     stats.rep();
     state.ResumeTiming();
 
-    dr::shp::sort(vec);
+    // mhp has no sort implementation yet, so only shp calls xhp::sort here
+#ifdef BENCH_SHP
+    xhp::sort(vec);
+#endif
   }
 }
 
 DR_BENCHMARK_REGISTER_F(DRSortFixture, Sort_DR);
 
+#ifdef SYCL_LANGUAGE_VERSION
 class SyclSortFixture : public benchmark::Fixture {
 protected:
   std::vector<T> local_vec;
+
   sycl::queue queue;
   oneapi::dpl::execution::device_policy<> policy;
   T *vec;
 
 public:
   void SetUp(::benchmark::State &) {
+    dr::drlog.debug("setting up SyclSortFixture\n");
     // when using mhp's get_queue() long execution is observed in this test
-    // (probably due to JIT), now shp and shp use their own get_queue-s
+    // (probably due to JIT), so mhp and shp each use their own get_queue()
     queue = get_queue();
     policy = oneapi::dpl::execution::make_device_policy(queue);
     local_vec = std::vector<T>(default_vector_size);
@@ -58,6 +65,7 @@ class SyclSortFixture : public benchmark::Fixture {
   }
 
   void TearDown(::benchmark::State &state) {
+    dr::drlog.debug("tearing down SyclSortFixture\n");
     // copy back to check if last sort really sorted
     queue.memcpy(local_vec.data(), vec, default_vector_size * sizeof(T)).wait();
     sycl::free(vec, queue);
@@ -103,6 +111,7 @@ BENCHMARK_DEFINE_F(SyclSortFixture, Sort_DPL)(benchmark::State &state) {
 }
 
 DR_BENCHMARK_REGISTER_F(SyclSortFixture, Sort_DPL);
+#endif
 
 class StdSortFixture : public benchmark::Fixture {
 protected:

diff --git a/benchmarks/gbench/mhp/CMakeLists.txt b/benchmarks/gbench/mhp/CMakeLists.txt
@@ -11,6 +11,7 @@ add_executable(
   ../common/distributed_vector.cpp
   ../common/dot_product.cpp
   ../common/inclusive_scan.cpp
+  ../common/sort.cpp
   ../common/stream.cpp
   wave_equation.cpp
   rooted.cpp