Skip to content

Commit 1083144

Browse files
authored
0.5.0 release (#67)
* Minor updates to clang-format and formatting source.
* Update version in headers and documentation.
1 parent 5df1b2c commit 1083144

File tree

19 files changed

+486
-417
lines changed

19 files changed

+486
-417
lines changed

.clang-format

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
---
2-
AllowShortIfStatementsOnASingleLine: true
32
BasedOnStyle: LLVM
43
ColumnLimit: 120
54
CommentPragmas: '^\\.+'
@@ -12,5 +11,6 @@ AlignTrailingComments: true
1211
AllowShortBlocksOnASingleLine: true
1312
AllowShortCaseLabelsOnASingleLine : true
1413
AllowShortIfStatementsOnASingleLine: true
15-
AllowShortLoopsOnASingleLine: true
14+
AllowShortLoopsOnASingleLine: false
15+
PenaltyBreakBeforeFirstCallParameter: 100
1616
...

benchmark/benchmark.cu

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,9 @@ template <typename T> static std::vector<T> process_timings(std::vector<T> times
5757
CHECK_MPI_EXIT(MPI_Comm_size(MPI_COMM_WORLD, &nranks));
5858
t_avg /= nranks;
5959

60-
for (auto& t : times) { t = (t - t_avg) * (t - t_avg); }
60+
for (auto& t : times) {
61+
t = (t - t_avg) * (t - t_avg);
62+
}
6163
double t_var = std::accumulate(times.begin(), times.end(), T(0)) / times.size();
6264
CHECK_MPI_EXIT(MPI_Allreduce(MPI_IN_PLACE, &t_var, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD));
6365
t_var /= nranks;
@@ -222,7 +224,9 @@ int main(int argc, char** argv) {
222224
cudecompGridDescAutotuneOptions_t options;
223225
CHECK_CUDECOMP_EXIT(cudecompGridDescAutotuneOptionsSetDefaults(&options));
224226
options.dtype = get_cudecomp_datatype(complex_t(0));
225-
for (int i = 0; i < 4; ++i) { options.transpose_use_inplace_buffers[i] = !out_of_place; }
227+
for (int i = 0; i < 4; ++i) {
228+
options.transpose_use_inplace_buffers[i] = !out_of_place;
229+
}
226230

227231
if (comm_backend != 0) {
228232
config.transpose_comm_backend = comm_backend;
@@ -306,20 +310,20 @@ int main(int argc, char** argv) {
306310
if (!no_slab_opt && config.pdims[0] == 1 && config.pdims[1] == 1) {
307311
// single rank, x-y-z slab: use 3D FFT
308312
slab_xyz = true;
309-
CHECK_CUFFT_EXIT(cufftMakePlan3d(cufft_plan_r2c_x, gx, gy, gz, get_cufft_type_r2c(real_t(0)),
310-
&work_sz_r2c_x));
311-
CHECK_CUFFT_EXIT(cufftMakePlan3d(cufft_plan_c2r_x, gx, gy, gz, get_cufft_type_c2r(real_t(0)),
312-
&work_sz_c2r_x));
313+
CHECK_CUFFT_EXIT(cufftMakePlan3d(cufft_plan_r2c_x, gx, gy, gz, get_cufft_type_r2c(real_t(0)), &work_sz_r2c_x));
314+
CHECK_CUFFT_EXIT(cufftMakePlan3d(cufft_plan_c2r_x, gx, gy, gz, get_cufft_type_c2r(real_t(0)), &work_sz_c2r_x));
313315
} else if (!no_slab_opt && config.pdims[0] == 1) {
314316
// x-y slab: use 2D FFT
315317
slab_xy = true;
316318
std::array<int, 2> n{gx, gy};
317-
CHECK_CUFFT_EXIT(cufftMakePlanMany(
318-
cufft_plan_r2c_x, 2, n.data(), nullptr, 1, pinfo_x_r.shape[0] * pinfo_x_r.shape[1], nullptr, 1,
319-
pinfo_x_c.shape[0] * pinfo_x_c.shape[1], get_cufft_type_r2c(real_t(0)), pinfo_x_r.shape[2], &work_sz_r2c_x));
320-
CHECK_CUFFT_EXIT(cufftMakePlanMany(
321-
cufft_plan_c2r_x, 2, n.data(), nullptr, 1, pinfo_x_c.shape[0] * pinfo_x_c.shape[1], nullptr, 1,
322-
pinfo_x_r.shape[0] * pinfo_x_r.shape[1], get_cufft_type_c2r(real_t(0)), pinfo_x_c.shape[2], &work_sz_c2r_x));
319+
CHECK_CUFFT_EXIT(cufftMakePlanMany(cufft_plan_r2c_x, 2, n.data(), nullptr, 1,
320+
pinfo_x_r.shape[0] * pinfo_x_r.shape[1], nullptr, 1,
321+
pinfo_x_c.shape[0] * pinfo_x_c.shape[1], get_cufft_type_r2c(real_t(0)),
322+
pinfo_x_r.shape[2], &work_sz_r2c_x));
323+
CHECK_CUFFT_EXIT(cufftMakePlanMany(cufft_plan_c2r_x, 2, n.data(), nullptr, 1,
324+
pinfo_x_c.shape[0] * pinfo_x_c.shape[1], nullptr, 1,
325+
pinfo_x_r.shape[0] * pinfo_x_r.shape[1], get_cufft_type_c2r(real_t(0)),
326+
pinfo_x_c.shape[2], &work_sz_c2r_x));
323327
} else {
324328
CHECK_CUFFT_EXIT(cufftMakePlan1d(cufft_plan_r2c_x, gx, get_cufft_type_r2c(real_t(0)),
325329
pinfo_x_r.shape[1] * pinfo_x_r.shape[2], &work_sz_r2c_x));
@@ -336,15 +340,15 @@ int main(int argc, char** argv) {
336340
if (!no_slab_opt && config.pdims[0] == 1 && config.pdims[1] == 1) {
337341
// single rank, x-y-z slab: use 3D FFT
338342
slab_xyz = true;
339-
CHECK_CUFFT_EXIT(cufftMakePlan3d(cufft_plan_c2c_x, gx, gy, gz, get_cufft_type_c2c(real_t(0)),
340-
&work_sz_c2c_x));
343+
CHECK_CUFFT_EXIT(cufftMakePlan3d(cufft_plan_c2c_x, gx, gy, gz, get_cufft_type_c2c(real_t(0)), &work_sz_c2c_x));
341344
} else if (!no_slab_opt && config.pdims[0] == 1) {
342345
// x-y slab: use 2D FFT
343346
slab_xy = true;
344347
std::array<int, 2> n{gy, gx};
345-
CHECK_CUFFT_EXIT(cufftMakePlanMany(
346-
cufft_plan_c2c_x, 2, n.data(), nullptr, 1, pinfo_x_c.shape[0] * pinfo_x_c.shape[1], nullptr, 1,
347-
pinfo_x_c.shape[0] * pinfo_x_c.shape[1], get_cufft_type_c2c(real_t(0)), pinfo_x_c.shape[2], &work_sz_c2c_x));
348+
CHECK_CUFFT_EXIT(cufftMakePlanMany(cufft_plan_c2c_x, 2, n.data(), nullptr, 1,
349+
pinfo_x_c.shape[0] * pinfo_x_c.shape[1], nullptr, 1,
350+
pinfo_x_c.shape[0] * pinfo_x_c.shape[1], get_cufft_type_c2c(real_t(0)),
351+
pinfo_x_c.shape[2], &work_sz_c2c_x));
348352
} else {
349353
CHECK_CUFFT_EXIT(cufftMakePlan1d(cufft_plan_c2c_x, gx, get_cufft_type_c2c(real_t(0)),
350354
pinfo_x_c.shape[1] * pinfo_x_c.shape[2], &work_sz_c2c_x));
@@ -363,9 +367,10 @@ int main(int argc, char** argv) {
363367
slab_yz = true;
364368
if (axis_contiguous[1]) {
365369
std::array<int, 2> n{gz, gy};
366-
CHECK_CUFFT_EXIT(cufftMakePlanMany(
367-
cufft_plan_c2c_y, 2, n.data(), nullptr, 1, pinfo_y_c.shape[0] * pinfo_y_c.shape[1], nullptr, 1,
368-
pinfo_y_c.shape[0] * pinfo_y_c.shape[1], get_cufft_type_c2c(real_t(0)), pinfo_y_c.shape[2], &work_sz_c2c_y));
370+
CHECK_CUFFT_EXIT(cufftMakePlanMany(cufft_plan_c2c_y, 2, n.data(), nullptr, 1,
371+
pinfo_y_c.shape[0] * pinfo_y_c.shape[1], nullptr, 1,
372+
pinfo_y_c.shape[0] * pinfo_y_c.shape[1], get_cufft_type_c2c(real_t(0)),
373+
pinfo_y_c.shape[2], &work_sz_c2c_y));
369374
} else {
370375
// Note: In this case, both slab dimensions are strided, leading to slower performance using
371376
// 2D FFT. Run 1D + 1D instead.
@@ -508,8 +513,8 @@ int main(int argc, char** argv) {
508513

509514
if (!slab_xyz) {
510515
CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc_c, input, output, work_c_d,
511-
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
512-
nullptr, nullptr, 0));
516+
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, nullptr, nullptr,
517+
0));
513518
}
514519

515520
if (!slab_xy && !slab_xyz) {
@@ -531,8 +536,8 @@ int main(int argc, char** argv) {
531536
// For y-z slab case, no need to perform yz transposes or z-axis FFT
532537
if (!slab_yz && !slab_xyz) {
533538
CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc_c, input, output, work_c_d,
534-
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
535-
nullptr, nullptr, 0));
539+
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, nullptr, nullptr,
540+
0));
536541
}
537542

538543
if (!slab_yz && !slab_xyz) {
@@ -547,8 +552,8 @@ int main(int argc, char** argv) {
547552

548553
if (!slab_yz && !slab_xyz) {
549554
CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc_c, input, output, work_c_d,
550-
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
551-
nullptr, nullptr, 0));
555+
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, nullptr, nullptr,
556+
0));
552557
}
553558

554559
if (!slab_xy && !slab_xyz) {
@@ -569,8 +574,8 @@ int main(int argc, char** argv) {
569574

570575
if (!slab_xyz) {
571576
CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc_c, input, output, work_c_d,
572-
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr,
573-
nullptr, nullptr, 0));
577+
get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, nullptr, nullptr,
578+
0));
574579
}
575580
#ifdef R2C
576581
CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2r_x, output, output_r, CUFFT_INVERSE));
@@ -650,7 +655,9 @@ int main(int argc, char** argv) {
650655
std::sort(trial_times.begin(), trial_times.end());
651656
double flopcount = 5.0 * fftsize * std::log(static_cast<double>(fftsize)) * 1e-9 / std::log(2.0);
652657
std::vector<double> trial_flops(ntrials);
653-
for (int i = 0; i < ntrials; ++i) { trial_flops[i] = flopcount / trial_times[i]; }
658+
for (int i = 0; i < ntrials; ++i) {
659+
trial_flops[i] = flopcount / trial_times[i];
660+
}
654661

655662
auto times = process_timings(trial_times, 1000.);
656663
auto flops = process_timings(trial_flops);

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
author = 'NVIDIA Corporation'
2525

2626
# The full version, including alpha/beta/rc tags
27-
version = '0.4.0'
27+
version = '0.5.0'
2828
release = version
2929

3030

examples/cc/basic_usage/basic_usage.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -266,12 +266,12 @@ int main(int argc, char** argv) {
266266
pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0));
267267

268268
// Transpose from Y-pencils to Z-pencils.
269-
CHECK_CUDECOMP_EXIT(
270-
cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
269+
CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
270+
nullptr, nullptr, nullptr, nullptr, 0));
271271

272272
// Transpose from Z-pencils to Y-pencils.
273-
CHECK_CUDECOMP_EXIT(
274-
cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
273+
CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
274+
nullptr, nullptr, nullptr, nullptr, 0));
275275

276276
// Transpose from Y-pencils to X-pencils.
277277
CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,

examples/cc/basic_usage/basic_usage_autotune.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -156,11 +156,11 @@ int main(int argc, char** argv) {
156156
options.transpose_use_inplace_buffers[1] = true; // use in-place buffers for Y-to-Z transpose
157157
options.transpose_use_inplace_buffers[2] = true; // use in-place buffers for Z-to-Y transpose
158158
options.transpose_use_inplace_buffers[3] = true; // use in-place buffers for Y-to-X transpose
159-
options.transpose_op_weights[0] = 1.0; // apply 1.0 multiplier to X-to-Y transpose timings
160-
options.transpose_op_weights[1] = 1.0; // apply 1.0 multiplier to Y-to-Z transpose timings
161-
options.transpose_op_weights[2] = 1.0; // apply 1.0 multiplier to Z-to-Y transpose timings
162-
options.transpose_op_weights[3] = 1.0; // apply 1.0 multiplier to Y-to-X transpose timings
163-
options.transpose_input_halo_extents[0][0] = 1; // set input_halo_extent to [1, 1, 1] for X-to-Y transpose
159+
options.transpose_op_weights[0] = 1.0; // apply 1.0 multiplier to X-to-Y transpose timings
160+
options.transpose_op_weights[1] = 1.0; // apply 1.0 multiplier to Y-to-Z transpose timings
161+
options.transpose_op_weights[2] = 1.0; // apply 1.0 multiplier to Z-to-Y transpose timings
162+
options.transpose_op_weights[3] = 1.0; // apply 1.0 multiplier to Y-to-X transpose timings
163+
options.transpose_input_halo_extents[0][0] = 1; // set input_halo_extent to [1, 1, 1] for X-to-Y transpose
164164
options.transpose_input_halo_extents[0][1] = 1;
165165
options.transpose_input_halo_extents[0][2] = 1;
166166
options.transpose_output_halo_extents[3][0] = 1; // set output_halo_extent to [1, 1, 1] for Y-to-X transpose
@@ -250,12 +250,12 @@ int main(int argc, char** argv) {
250250
pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0));
251251

252252
// Transpose from Y-pencils to Z-pencils.
253-
CHECK_CUDECOMP_EXIT(
254-
cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
253+
CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
254+
nullptr, nullptr, nullptr, nullptr, 0));
255255

256256
// Transpose from Z-pencils to Y-pencils.
257-
CHECK_CUDECOMP_EXIT(
258-
cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0));
257+
CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,
258+
nullptr, nullptr, nullptr, nullptr, 0));
259259

260260
// Transpose from Y-pencils to X-pencils.
261261
CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE,

0 commit comments

Comments
 (0)