Some fixes to the benchmark framework, including weak scaling of the wave equation (#577)

lslusarczyk · web-flow · commit 3002448eb1c4 · 2023-10-14T18:45:20.000Z
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
@@ -114,12 +114,12 @@ jobs:
       run: pip install src-python/drbench
     - name: Test
       run: srun -p pvc-shared scripts/devcloud-test.sh
-    - name: Upload build logs
-      uses: actions/upload-artifact@v3
-      if: always()
-      with:
-        name: log-icpx
-        path: build/Testing
+#    - name: Upload build logs
+#      uses: actions/upload-artifact@v3
+#      if: always()
+#      with:
+#        name: log-icpx
+#        path: build/Testing
 
   publish:
     needs: [checks, clang, gcc, icpx]
diff --git a/benchmarks/gbench/CMakeLists.txt b/benchmarks/gbench/CMakeLists.txt
@@ -28,53 +28,55 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
   if(ENABLE_SYCL)
     add_subdirectory(shp)
 
+    add_custom_target(xhp-bench DEPENDS mhp-bench shp-bench)
+
     add_custom_target(devcloud-bench DEPENDS devcloud-bench-results)
     add_custom_command(
       OUTPUT devcloud-bench-results
       COMMAND dr-bench clean
       COMMAND dr-bench suite --no-p2p --gpus 4 --sockets 2 --cores-per-socket 56
       COMMAND dr-bench plot
-      DEPENDS mhp-bench shp-bench)
+      DEPENDS xhp-bench)
 
     add_custom_target(aurora-bench DEPENDS aurora-bench-results)
     add_custom_command(
       OUTPUT aurora-bench-results
       COMMAND dr-bench clean
       COMMAND dr-bench suite --gpus 12
       COMMAND dr-bench plot
-      DEPENDS mhp-bench shp-bench)
+      DEPENDS xhp-bench)
 
     add_custom_target(aurora-bench-2 DEPENDS aurora-bench-2-results)
     add_custom_command(
       OUTPUT aurora-bench-2-results
       COMMAND dr-bench clean --prefix aurora-2
       COMMAND dr-bench suite --prefix aurora-2 --nodes 2 --gpus 12
       COMMAND dr-bench plot --prefix aurora-2
-      DEPENDS mhp-bench shp-bench)
+      DEPENDS xhp-bench)
 
     add_custom_target(aurora-bench-4 DEPENDS aurora-bench-4-results)
     add_custom_command(
       OUTPUT aurora-bench-4-results
       COMMAND dr-bench clean --prefix aurora-4
       COMMAND dr-bench suite --prefix aurora-4 --nodes 4 --gpus 12
       COMMAND dr-bench plot --prefix aurora-4
-      DEPENDS mhp-bench shp-bench)
+      DEPENDS xhp-bench)
 
     add_custom_target(quick-bench-gpu DEPENDS quick-bench-gpu-results)
     add_custom_command(
       OUTPUT quick-bench-gpu-results
       COMMAND dr-bench clean
       COMMAND dr-bench suite --reps 10 --gpus 2
       COMMAND dr-bench plot
-      DEPENDS mhp-bench shp-bench)
+      DEPENDS xhp-bench)
 
     add_custom_target(quick-bench-cpu DEPENDS quick-bench-cpu-results)
     add_custom_command(
       OUTPUT quick-bench-cpu-results
       COMMAND dr-bench clean
       COMMAND dr-bench suite --reps 10 --cores-per-socket 4
       COMMAND dr-bench plot
-      DEPENDS mhp-bench shp-bench)
+      DEPENDS xhp-bench)
 
   endif()
 endif()
diff --git a/benchmarks/gbench/mhp/mhp-bench.cpp b/benchmarks/gbench/mhp/mhp-bench.cpp
@@ -31,7 +31,7 @@ class NullReporter : public ::benchmark::BenchmarkReporter {
 void dr_init() {
 #ifdef SYCL_LANGUAGE_VERSION
   if (options.count("sycl")) {
-    sycl::queue q = dr::mhp::select_queue();
+    sycl::queue q = dr::mhp::select_queue(options.count("different-devices"));
     benchmark::AddCustomContext("device_info", device_info(q.get_device()));
     dr::mhp::init(q);
     return;
@@ -75,6 +75,7 @@ int main(int argc, char *argv[]) {
     ("log", "Enable logging")
 #ifdef SYCL_LANGUAGE_VERSION
     ("sycl", "Execute on SYCL device")
+    ("different-devices", "ensure no multiple ranks on one device")
 #endif
     ("reps", "Debug repetitions for short duration vector operations", cxxopts::value<std::size_t>()->default_value("1"))
     ("rows", "Number of rows", cxxopts::value<std::size_t>()->default_value("10000"))
diff --git a/benchmarks/gbench/mhp/wave_equation.cpp b/benchmarks/gbench/mhp/wave_equation.cpp
@@ -623,6 +623,7 @@ int main(int argc, char *argv[]) {
     ("n", "Grid size", cxxopts::value<std::size_t>()->default_value("128"))
     ("t,benchmark-mode", "Run a fixed number of time steps.", cxxopts::value<bool>()->default_value("false"))
     ("sycl", "Execute on SYCL device")
+    ("l,log", "enable logging")
     ("f,fused-kernel", "Use fused kernels.", cxxopts::value<bool>()->default_value("false"))
     ("h,help", "Print help");
   // clang-format on
@@ -635,6 +636,12 @@ int main(int argc, char *argv[]) {
     exit(1);
   }
 
+  std::unique_ptr<std::ofstream> logfile;
+  if (options.count("log")) {
+    logfile.reset(new std::ofstream(fmt::format("dr.{}.log", comm_rank)));
+    dr::drlog.set_file(*logfile);
+  }
+
   if (options.count("sycl")) {
 #ifdef SYCL_LANGUAGE_VERSION
     sycl::queue q = dr::mhp::select_queue();
@@ -666,7 +673,13 @@ int main(int argc, char *argv[]) {
 
 static void WaveEquation_DR(benchmark::State &state) {
 
-  int n = 4000;
+  int n = ::sqrtl(default_vector_size);
+
+  // Ugly hack to make this run in a reasonable time in the benchmarking
+  // framework: drbench.py should specify the right size, or there should be
+  // another size option to use here instead of default_vector_size.
+  n /= 4;
+
   std::size_t nread, nwrite, nflop;
   WaveEquation::calculate_complexity(n, n, nread, nwrite, nflop);
   Stats stats(state, nread, nwrite, nflop);
diff --git a/examples/mhp/dot_product_benchmark.cpp b/examples/mhp/dot_product_benchmark.cpp
@@ -19,7 +19,6 @@ namespace mhp = dr::mhp;
 
 using T = double;
 
-MPI_Comm comm;
 std::size_t comm_rank;
 std::size_t comm_size;
 
@@ -134,10 +133,9 @@ void stats(auto &durations, auto &sum, auto v_serial, auto &x_local,
 
 int main(int argc, char **argv) {
   MPI_Init(&argc, &argv);
-  comm = MPI_COMM_WORLD;
   int rank, size;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &size);
   comm_rank = rank;
   comm_size = size;
 
@@ -171,7 +169,7 @@ int main(int argc, char **argv) {
   }
   dr::drlog.debug("Rank: {}\n", comm_rank);
 
-  sycl::queue q = mhp::select_queue(comm);
+  sycl::queue q = mhp::select_queue();
   if (options.count("sycl")) {
     mhp::init(q);
   } else {
diff --git a/include/dr/mhp/global.hpp b/include/dr/mhp/global.hpp
@@ -129,7 +129,7 @@ inline std::string hostname() {
 inline sycl::queue &sycl_queue() { return __detail::gcontext()->sycl_queue_; }
 inline auto dpl_policy() { return __detail::gcontext()->dpl_policy_; }
 
-inline sycl::queue select_queue(MPI_Comm comm = MPI_COMM_WORLD) {
+inline sycl::queue select_queue(bool check_different_devices = false) {
   std::vector<sycl::device> devices;
 
   auto root_devices = sycl::platform().get_devices();
@@ -156,9 +156,11 @@ inline sycl::queue select_queue(MPI_Comm comm = MPI_COMM_WORLD) {
   }
 
   assert(rng::size(devices) > 0);
+  const auto my_rank = dr::communicator(MPI_COMM_WORLD).rank();
+  assert(!check_different_devices || my_rank < rng::size(devices));
+
   // Round robin assignment of devices to ranks
-  return sycl::queue(
-      devices[dr::communicator(comm).rank() % rng::size(devices)]);
+  return sycl::queue(devices[my_rank % rng::size(devices)]);
 }
 
 inline void init(sycl::queue q) {
@@ -171,7 +173,7 @@ inline void init(sycl::queue q) {
 template <typename Selector = decltype(sycl::default_selector_v)>
 inline void init(Selector &&selector = sycl::default_selector_v) {
   __detail::initialize_mpi();
-  sycl::queue q = mhp::select_queue(MPI_COMM_WORLD);
+  sycl::queue q = mhp::select_queue();
   init(q);
 }
 
diff --git a/src-python/drbench/drbench/drbench.py b/src-python/drbench/drbench/drbench.py
@@ -20,6 +20,7 @@ def __init__(self):
         self.mhp_bench = None
         self.shp_bench = None
         self.weak_scaling = False
+        self.different_devices = False
         self.ranks = 1
         self.ranks_per_node = None
         self.target = None
@@ -62,6 +63,13 @@ def __init__(self):
     help="Scales the vector size by the number of ranks",
 )
 
+option_different_devices = click.option(
+    "--different-devices",
+    is_flag=True,
+    default=False,
+    help="Ensures there are not multiple ranks on one SYCL device",
+)
+
 
 # common arguments
 @click.group()
@@ -106,6 +114,7 @@ def do_run(options):
             options.mhp_bench,
             options.shp_bench,
             options.weak_scaling,
+            options.different_devices,
             options.ranks_per_node,
         )
     )
@@ -165,6 +174,7 @@ def do_run(options):
 @option_dry_run
 @option_clean
 @option_weak_scaling
+@option_different_devices
 def run(
     prefix,
     target,
@@ -180,6 +190,7 @@ def run(
     dry_run,
     clean,
     weak_scaling,
+    different_devices,
 ):
     assert target
     assert vec_size
@@ -212,6 +223,7 @@ def run(
     options.mhp_bench = mhp_bench
     options.shp_bench = shp_bench
     options.weak_scaling = weak_scaling
+    options.different_devices = different_devices
     options.dry_run = dry_run
 
     do_run(options)
@@ -224,6 +236,7 @@ def run(
 @option_dry_run
 @option_clean
 @option_weak_scaling
+@option_different_devices
 @click.option(
     "--vec-size",
     type=int,
@@ -279,6 +292,7 @@ def suite(
     sockets,
     cores_per_socket,
     weak_scaling,
+    different_devices,
 ):
     # Run a list of ranks
     def run_rank_list(base, ranks, filters, targets, weak_scaling=False):
@@ -447,6 +461,10 @@ def multi_node(base):
     base.vec_size = [vec_size]
     base.reps = reps
     base.weak_scaling = weak_scaling
+    base.different_devices = different_devices
+
+    print(f"weak_scaling is {weak_scaling}\n")
+    print(f"different_devices is {different_devices}\n")
 
     # if the platform does not support p2p, limit gpus to 1
     if p2p:
diff --git a/src-python/drbench/drbench/plotter.py b/src-python/drbench/drbench/plotter.py
@@ -16,6 +16,19 @@ class Plotter:
     tbs_title = "Bandwidth (TB/s)"
     gbs_title = "Bandwidth (GB/s)"
     speedup_title = "Speedup"
+    gpus_num_title = "Number of GPU Tiles"
+    sockets_num_title = "Number of CPU Sockets"
+
+    device_info = {
+        "GPU": {
+            "x_title": gpus_num_title,
+            "targets": ["MHP_SYCL_GPU", "SHP_SYCL_GPU"],
+        },
+        "CPU": {
+            "x_title": sockets_num_title,
+            "targets": ["MHP_SYCL_CPU", "MHP_DIRECT_CPU"],
+        },
+    }
 
     @staticmethod
     def __name_target(bname, target, device):
@@ -78,9 +91,9 @@ def __import_file(fname: str, rows):
                         "runtime": runtime,
                         "device": device,
                         "vsize": vsize,
-                        "Number of GPU Tiles": ranks,
+                        Plotter.gpus_num_title: ranks,
                         "Number of CPU Cores": cpu_cores,
-                        "Number of CPU Sockets": cpu_sockets,
+                        Plotter.sockets_num_title: cpu_sockets,
                         "rtime": rtime,
                     }
                 )
@@ -195,18 +208,8 @@ def __plot(
         "Stream_Triad": stream_info,
     }
 
-    device_info = {
-        "GPU": {
-            "x_title": "Number of GPU Tiles",
-            "targets": ["MHP_SYCL_GPU", "SHP_SYCL_GPU"],
-        },
-        "CPU": {
-            "x_title": "Number of CPU Sockets",
-            "targets": ["MHP_SYCL_CPU", "MHP_DIRECT_CPU"],
-        },
-    }
-
-    def __x_domain(self, db, target, x_title):
+    @staticmethod
+    def __x_domain(db, target, x_title):
         points = db.loc[db["Target"] == target]
         val = points[x_title].values[0]
         last = points[x_title].values[-1]
@@ -283,15 +286,15 @@ def __speedup_plot(
 
         db = self.db.copy()
         db = db.loc[(db["Benchmark"] == benchmark) & (db["device"] == device)]
-        db = db.sort_values(by=["Benchmark", "Target", x_title])
+        db = db.sort_values(by=["Target", x_title])
 
         targets = self.__find_targets(db, device)
 
         if db.shape[0] == 0 or len(targets) == 0:
             click.echo(f"  no data for {benchmark} {device}")
             return
 
-        x_domain = self.__x_domain(db, targets[0], x_title)
+        xy_domain = self.__x_domain(db, targets[0], x_title)
         db.to_csv(f"{fname}.csv")
 
         reference = db.loc[
@@ -332,8 +335,8 @@ def __speedup_plot(
             benchmark,
             x_title,
             y_title,
-            x_domain,
-            x_domain,
+            xy_domain,
+            xy_domain,
             lines,
             fname,
             # display_perfect_scaling=False
diff --git a/src-python/drbench/drbench/runner.py b/src-python/drbench/drbench/runner.py
@@ -14,7 +14,7 @@
     "AnalysisConfig",
     (
         "prefix benchmark_filter reps dry_run mhp_bench shp_bench "
-        "weak_scaling ranks_per_node"
+        "weak_scaling different_devices ranks_per_node"
     ),
 )
 
@@ -31,6 +31,8 @@ def __execute(self, command: str):
     def __run_mhp_analysis(self, params, ranks, ranks_per_node, target):
         if target.runtime == Runtime.SYCL:
             params.append("--sycl")
+            if self.analysis_config.different_devices:
+                params.append("--different-devices")
             if target.device == Device.CPU:
                 env = "ONEAPI_DEVICE_SELECTOR=opencl:cpu"
             else: