Skip to content

Commit 3002448

Browse files
authored
some fixes to benchmark framework including weak scaling of wave equation (#577)
1 parent 7280f1f commit 3002448

File tree

9 files changed

+81
-42
lines changed

9 files changed

+81
-42
lines changed

.github/workflows/pr.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -114,12 +114,12 @@ jobs:
114114
run: pip install src-python/drbench
115115
- name: Test
116116
run: srun -p pvc-shared scripts/devcloud-test.sh
117-
- name: Upload build logs
118-
uses: actions/upload-artifact@v3
119-
if: always()
120-
with:
121-
name: log-icpx
122-
path: build/Testing
117+
# - name: Upload build logs
118+
# uses: actions/upload-artifact@v3
119+
# if: always()
120+
# with:
121+
# name: log-icpx
122+
# path: build/Testing
123123

124124
publish:
125125
needs: [checks, clang, gcc, icpx]

benchmarks/gbench/CMakeLists.txt

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,53 +28,55 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
2828
if(ENABLE_SYCL)
2929
add_subdirectory(shp)
3030

31+
add_custom_target(xhp-bench DEPENDS mhp-bench shp-bench)
32+
3133
add_custom_target(devcloud-bench DEPENDS devcloud-bench-results)
3234
add_custom_command(
3335
OUTPUT devcloud-bench-results
3436
COMMAND dr-bench clean
3537
COMMAND dr-bench suite --no-p2p --gpus 4 --sockets 2 --cores-per-socket 56
3638
COMMAND dr-bench plot
37-
DEPENDS mhp-bench shp-bench)
39+
DEPENDS xhp-bench)
3840

3941
add_custom_target(aurora-bench DEPENDS aurora-bench-results)
4042
add_custom_command(
4143
OUTPUT aurora-bench-results
4244
COMMAND dr-bench clean
4345
COMMAND dr-bench suite --gpus 12
4446
COMMAND dr-bench plot
45-
DEPENDS mhp-bench shp-bench)
47+
DEPENDS xhp-bench)
4648

4749
add_custom_target(aurora-bench-2 DEPENDS aurora-bench-2-results)
4850
add_custom_command(
4951
OUTPUT aurora-bench-2-results
5052
COMMAND dr-bench clean --prefix aurora-2
5153
COMMAND dr-bench suite --prefix aurora-2 --nodes 2 --gpus 12
5254
COMMAND dr-bench plot --prefix aurora-2
53-
DEPENDS mhp-bench shp-bench)
55+
DEPENDS xhp-bench)
5456

5557
add_custom_target(aurora-bench-4 DEPENDS aurora-bench-4-results)
5658
add_custom_command(
5759
OUTPUT aurora-bench-4-results
5860
COMMAND dr-bench clean --prefix aurora-4
5961
COMMAND dr-bench suite --prefix aurora-4 --nodes 4 --gpus 12
6062
COMMAND dr-bench plot --prefix aurora-4
61-
DEPENDS mhp-bench shp-bench)
63+
DEPENDS xhp-bench)
6264

6365
add_custom_target(quick-bench-gpu DEPENDS quick-bench-gpu-results)
6466
add_custom_command(
6567
OUTPUT quick-bench-gpu-results
6668
COMMAND dr-bench clean
6769
COMMAND dr-bench suite --reps 10 --gpus 2
6870
COMMAND dr-bench plot
69-
DEPENDS mhp-bench shp-bench)
71+
DEPENDS xhp-bench)
7072

7173
add_custom_target(quick-bench-cpu DEPENDS quick-bench-cpu-results)
7274
add_custom_command(
7375
OUTPUT quick-bench-cpu-results
7476
COMMAND dr-bench clean
7577
COMMAND dr-bench suite --reps 10 --cores-per-socket 4
7678
COMMAND dr-bench plot
77-
DEPENDS mhp-bench shp-bench)
79+
DEPENDS xhp-bench)
7880

7981
endif()
8082
endif()

benchmarks/gbench/mhp/mhp-bench.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class NullReporter : public ::benchmark::BenchmarkReporter {
3131
void dr_init() {
3232
#ifdef SYCL_LANGUAGE_VERSION
3333
if (options.count("sycl")) {
34-
sycl::queue q = dr::mhp::select_queue();
34+
sycl::queue q = dr::mhp::select_queue(options.count("different-devices"));
3535
benchmark::AddCustomContext("device_info", device_info(q.get_device()));
3636
dr::mhp::init(q);
3737
return;
@@ -75,6 +75,7 @@ int main(int argc, char *argv[]) {
7575
("log", "Enable logging")
7676
#ifdef SYCL_LANGUAGE_VERSION
7777
("sycl", "Execute on SYCL device")
78+
("different-devices", "ensure no multiple ranks on one device")
7879
#endif
7980
("reps", "Debug repetitions for short duration vector operations", cxxopts::value<std::size_t>()->default_value("1"))
8081
("rows", "Number of rows", cxxopts::value<std::size_t>()->default_value("10000"))

benchmarks/gbench/mhp/wave_equation.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,7 @@ int main(int argc, char *argv[]) {
623623
("n", "Grid size", cxxopts::value<std::size_t>()->default_value("128"))
624624
("t,benchmark-mode", "Run a fixed number of time steps.", cxxopts::value<bool>()->default_value("false"))
625625
("sycl", "Execute on SYCL device")
626+
("l,log", "enable logging")
626627
("f,fused-kernel", "Use fused kernels.", cxxopts::value<bool>()->default_value("false"))
627628
("h,help", "Print help");
628629
// clang-format on
@@ -635,6 +636,12 @@ int main(int argc, char *argv[]) {
635636
exit(1);
636637
}
637638

639+
std::unique_ptr<std::ofstream> logfile;
640+
if (options.count("log")) {
641+
logfile.reset(new std::ofstream(fmt::format("dr.{}.log", comm_rank)));
642+
dr::drlog.set_file(*logfile);
643+
}
644+
638645
if (options.count("sycl")) {
639646
#ifdef SYCL_LANGUAGE_VERSION
640647
sycl::queue q = dr::mhp::select_queue();
@@ -666,7 +673,13 @@ int main(int argc, char *argv[]) {
666673

667674
static void WaveEquation_DR(benchmark::State &state) {
668675

669-
int n = 4000;
676+
int n = ::sqrtl(default_vector_size);
677+
678+
// ugly hack to make it work in reasonable time in the benchmarking framework
679+
// drbench.py should specify the right size, or there should be another size option
680+
// to use here instead of default_vector_size
681+
n /= 4;
682+
670683
std::size_t nread, nwrite, nflop;
671684
WaveEquation::calculate_complexity(n, n, nread, nwrite, nflop);
672685
Stats stats(state, nread, nwrite, nflop);

examples/mhp/dot_product_benchmark.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ namespace mhp = dr::mhp;
1919

2020
using T = double;
2121

22-
MPI_Comm comm;
2322
std::size_t comm_rank;
2423
std::size_t comm_size;
2524

@@ -134,10 +133,9 @@ void stats(auto &durations, auto &sum, auto v_serial, auto &x_local,
134133

135134
int main(int argc, char **argv) {
136135
MPI_Init(&argc, &argv);
137-
comm = MPI_COMM_WORLD;
138136
int rank, size;
139-
MPI_Comm_rank(comm, &rank);
140-
MPI_Comm_size(comm, &size);
137+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
138+
MPI_Comm_size(MPI_COMM_WORLD, &size);
141139
comm_rank = rank;
142140
comm_size = size;
143141

@@ -171,7 +169,7 @@ int main(int argc, char **argv) {
171169
}
172170
dr::drlog.debug("Rank: {}\n", comm_rank);
173171

174-
sycl::queue q = mhp::select_queue(comm);
172+
sycl::queue q = mhp::select_queue();
175173
if (options.count("sycl")) {
176174
mhp::init(q);
177175
} else {

include/dr/mhp/global.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ inline std::string hostname() {
129129
inline sycl::queue &sycl_queue() { return __detail::gcontext()->sycl_queue_; }
130130
inline auto dpl_policy() { return __detail::gcontext()->dpl_policy_; }
131131

132-
inline sycl::queue select_queue(MPI_Comm comm = MPI_COMM_WORLD) {
132+
inline sycl::queue select_queue(bool check_different_devices = false) {
133133
std::vector<sycl::device> devices;
134134

135135
auto root_devices = sycl::platform().get_devices();
@@ -156,9 +156,11 @@ inline sycl::queue select_queue(MPI_Comm comm = MPI_COMM_WORLD) {
156156
}
157157

158158
assert(rng::size(devices) > 0);
159+
const auto my_rank = dr::communicator(MPI_COMM_WORLD).rank();
160+
assert(!check_different_devices || my_rank < rng::size(devices));
161+
159162
// Round robin assignment of devices to ranks
160-
return sycl::queue(
161-
devices[dr::communicator(comm).rank() % rng::size(devices)]);
163+
return sycl::queue(devices[my_rank % rng::size(devices)]);
162164
}
163165

164166
inline void init(sycl::queue q) {
@@ -171,7 +173,7 @@ inline void init(sycl::queue q) {
171173
template <typename Selector = decltype(sycl::default_selector_v)>
172174
inline void init(Selector &&selector = sycl::default_selector_v) {
173175
__detail::initialize_mpi();
174-
sycl::queue q = mhp::select_queue(MPI_COMM_WORLD);
176+
sycl::queue q = mhp::select_queue();
175177
init(q);
176178
}
177179

src-python/drbench/drbench/drbench.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def __init__(self):
2020
self.mhp_bench = None
2121
self.shp_bench = None
2222
self.weak_scaling = False
23+
self.different_devices = False
2324
self.ranks = 1
2425
self.ranks_per_node = None
2526
self.target = None
@@ -62,6 +63,13 @@ def __init__(self):
6263
help="Scales the vector size by the number of ranks",
6364
)
6465

66+
option_different_devices = click.option(
67+
"--different-devices",
68+
is_flag=True,
69+
default=False,
70+
help="Ensures there are not multiple ranks on one SYCL device",
71+
)
72+
6573

6674
# common arguments
6775
@click.group()
@@ -106,6 +114,7 @@ def do_run(options):
106114
options.mhp_bench,
107115
options.shp_bench,
108116
options.weak_scaling,
117+
options.different_devices,
109118
options.ranks_per_node,
110119
)
111120
)
@@ -165,6 +174,7 @@ def do_run(options):
165174
@option_dry_run
166175
@option_clean
167176
@option_weak_scaling
177+
@option_different_devices
168178
def run(
169179
prefix,
170180
target,
@@ -180,6 +190,7 @@ def run(
180190
dry_run,
181191
clean,
182192
weak_scaling,
193+
different_devices,
183194
):
184195
assert target
185196
assert vec_size
@@ -212,6 +223,7 @@ def run(
212223
options.mhp_bench = mhp_bench
213224
options.shp_bench = shp_bench
214225
options.weak_scaling = weak_scaling
226+
options.different_devices = different_devices
215227
options.dry_run = dry_run
216228

217229
do_run(options)
@@ -224,6 +236,7 @@ def run(
224236
@option_dry_run
225237
@option_clean
226238
@option_weak_scaling
239+
@option_different_devices
227240
@click.option(
228241
"--vec-size",
229242
type=int,
@@ -279,6 +292,7 @@ def suite(
279292
sockets,
280293
cores_per_socket,
281294
weak_scaling,
295+
different_devices,
282296
):
283297
# Run a list of ranks
284298
def run_rank_list(base, ranks, filters, targets, weak_scaling=False):
@@ -447,6 +461,10 @@ def multi_node(base):
447461
base.vec_size = [vec_size]
448462
base.reps = reps
449463
base.weak_scaling = weak_scaling
464+
base.different_devices = different_devices
465+
466+
print(f"weak_scaling is {weak_scaling}\n")
467+
print(f"different_devices is {different_devices}\n")
450468

451469
# if the platform does not support p2p, limit gpus to 1
452470
if p2p:

src-python/drbench/drbench/plotter.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,19 @@ class Plotter:
1616
tbs_title = "Bandwidth (TB/s)"
1717
gbs_title = "Bandwidth (GB/s)"
1818
speedup_title = "Speedup"
19+
gpus_num_title = "Number of GPU Tiles"
20+
sockets_num_title = "Number of CPU Sockets"
21+
22+
device_info = {
23+
"GPU": {
24+
"x_title": gpus_num_title,
25+
"targets": ["MHP_SYCL_GPU", "SHP_SYCL_GPU"],
26+
},
27+
"CPU": {
28+
"x_title": sockets_num_title,
29+
"targets": ["MHP_SYCL_CPU", "MHP_DIRECT_CPU"],
30+
},
31+
}
1932

2033
@staticmethod
2134
def __name_target(bname, target, device):
@@ -78,9 +91,9 @@ def __import_file(fname: str, rows):
7891
"runtime": runtime,
7992
"device": device,
8093
"vsize": vsize,
81-
"Number of GPU Tiles": ranks,
94+
Plotter.gpus_num_title: ranks,
8295
"Number of CPU Cores": cpu_cores,
83-
"Number of CPU Sockets": cpu_sockets,
96+
Plotter.sockets_num_title: cpu_sockets,
8497
"rtime": rtime,
8598
}
8699
)
@@ -195,18 +208,8 @@ def __plot(
195208
"Stream_Triad": stream_info,
196209
}
197210

198-
device_info = {
199-
"GPU": {
200-
"x_title": "Number of GPU Tiles",
201-
"targets": ["MHP_SYCL_GPU", "SHP_SYCL_GPU"],
202-
},
203-
"CPU": {
204-
"x_title": "Number of CPU Sockets",
205-
"targets": ["MHP_SYCL_CPU", "MHP_DIRECT_CPU"],
206-
},
207-
}
208-
209-
def __x_domain(self, db, target, x_title):
211+
@staticmethod
212+
def __x_domain(db, target, x_title):
210213
points = db.loc[db["Target"] == target]
211214
val = points[x_title].values[0]
212215
last = points[x_title].values[-1]
@@ -283,15 +286,15 @@ def __speedup_plot(
283286

284287
db = self.db.copy()
285288
db = db.loc[(db["Benchmark"] == benchmark) & (db["device"] == device)]
286-
db = db.sort_values(by=["Benchmark", "Target", x_title])
289+
db = db.sort_values(by=["Target", x_title])
287290

288291
targets = self.__find_targets(db, device)
289292

290293
if db.shape[0] == 0 or len(targets) == 0:
291294
click.echo(f" no data for {benchmark} {device}")
292295
return
293296

294-
x_domain = self.__x_domain(db, targets[0], x_title)
297+
xy_domain = self.__x_domain(db, targets[0], x_title)
295298
db.to_csv(f"{fname}.csv")
296299

297300
reference = db.loc[
@@ -332,8 +335,8 @@ def __speedup_plot(
332335
benchmark,
333336
x_title,
334337
y_title,
335-
x_domain,
336-
x_domain,
338+
xy_domain,
339+
xy_domain,
337340
lines,
338341
fname,
339342
# display_perfect_scaling=False

src-python/drbench/drbench/runner.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"AnalysisConfig",
1515
(
1616
"prefix benchmark_filter reps dry_run mhp_bench shp_bench "
17-
"weak_scaling ranks_per_node"
17+
"weak_scaling different_devices ranks_per_node"
1818
),
1919
)
2020

@@ -31,6 +31,8 @@ def __execute(self, command: str):
3131
def __run_mhp_analysis(self, params, ranks, ranks_per_node, target):
3232
if target.runtime == Runtime.SYCL:
3333
params.append("--sycl")
34+
if self.analysis_config.different_devices:
35+
params.append("--different-devices")
3436
if target.device == Device.CPU:
3537
env = "ONEAPI_DEVICE_SELECTOR=opencl:cpu"
3638
else:

0 commit comments

Comments
 (0)