@@ -57,7 +57,9 @@ template <typename T> static std::vector<T> process_timings(std::vector<T> times
57
57
CHECK_MPI_EXIT (MPI_Comm_size (MPI_COMM_WORLD, &nranks));
58
58
t_avg /= nranks;
59
59
60
- for (auto & t : times) { t = (t - t_avg) * (t - t_avg); }
60
+ for (auto & t : times) {
61
+ t = (t - t_avg) * (t - t_avg);
62
+ }
61
63
double t_var = std::accumulate (times.begin (), times.end (), T (0 )) / times.size ();
62
64
CHECK_MPI_EXIT (MPI_Allreduce (MPI_IN_PLACE, &t_var, 1 , MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD));
63
65
t_var /= nranks;
@@ -222,7 +224,9 @@ int main(int argc, char** argv) {
222
224
cudecompGridDescAutotuneOptions_t options;
223
225
CHECK_CUDECOMP_EXIT (cudecompGridDescAutotuneOptionsSetDefaults (&options));
224
226
options.dtype = get_cudecomp_datatype (complex_t (0 ));
225
- for (int i = 0 ; i < 4 ; ++i) { options.transpose_use_inplace_buffers [i] = !out_of_place; }
227
+ for (int i = 0 ; i < 4 ; ++i) {
228
+ options.transpose_use_inplace_buffers [i] = !out_of_place;
229
+ }
226
230
227
231
if (comm_backend != 0 ) {
228
232
config.transpose_comm_backend = comm_backend;
@@ -306,20 +310,20 @@ int main(int argc, char** argv) {
306
310
if (!no_slab_opt && config.pdims [0 ] == 1 && config.pdims [1 ] == 1 ) {
307
311
// single rank, x-y-z slab: use 3D FFT
308
312
slab_xyz = true ;
309
- CHECK_CUFFT_EXIT (cufftMakePlan3d (cufft_plan_r2c_x, gx, gy, gz, get_cufft_type_r2c (real_t (0 )),
310
- &work_sz_r2c_x));
311
- CHECK_CUFFT_EXIT (cufftMakePlan3d (cufft_plan_c2r_x, gx, gy, gz, get_cufft_type_c2r (real_t (0 )),
312
- &work_sz_c2r_x));
313
+ CHECK_CUFFT_EXIT (cufftMakePlan3d (cufft_plan_r2c_x, gx, gy, gz, get_cufft_type_r2c (real_t (0 )), &work_sz_r2c_x));
314
+ CHECK_CUFFT_EXIT (cufftMakePlan3d (cufft_plan_c2r_x, gx, gy, gz, get_cufft_type_c2r (real_t (0 )), &work_sz_c2r_x));
313
315
} else if (!no_slab_opt && config.pdims [0 ] == 1 ) {
314
316
// x-y slab: use 2D FFT
315
317
slab_xy = true ;
316
318
std::array<int , 2 > n{gx, gy};
317
- CHECK_CUFFT_EXIT (cufftMakePlanMany (
318
- cufft_plan_r2c_x, 2 , n.data (), nullptr , 1 , pinfo_x_r.shape [0 ] * pinfo_x_r.shape [1 ], nullptr , 1 ,
319
- pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], get_cufft_type_r2c (real_t (0 )), pinfo_x_r.shape [2 ], &work_sz_r2c_x));
320
- CHECK_CUFFT_EXIT (cufftMakePlanMany (
321
- cufft_plan_c2r_x, 2 , n.data (), nullptr , 1 , pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], nullptr , 1 ,
322
- pinfo_x_r.shape [0 ] * pinfo_x_r.shape [1 ], get_cufft_type_c2r (real_t (0 )), pinfo_x_c.shape [2 ], &work_sz_c2r_x));
319
+ CHECK_CUFFT_EXIT (cufftMakePlanMany (cufft_plan_r2c_x, 2 , n.data (), nullptr , 1 ,
320
+ pinfo_x_r.shape [0 ] * pinfo_x_r.shape [1 ], nullptr , 1 ,
321
+ pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], get_cufft_type_r2c (real_t (0 )),
322
+ pinfo_x_r.shape [2 ], &work_sz_r2c_x));
323
+ CHECK_CUFFT_EXIT (cufftMakePlanMany (cufft_plan_c2r_x, 2 , n.data (), nullptr , 1 ,
324
+ pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], nullptr , 1 ,
325
+ pinfo_x_r.shape [0 ] * pinfo_x_r.shape [1 ], get_cufft_type_c2r (real_t (0 )),
326
+ pinfo_x_c.shape [2 ], &work_sz_c2r_x));
323
327
} else {
324
328
CHECK_CUFFT_EXIT (cufftMakePlan1d (cufft_plan_r2c_x, gx, get_cufft_type_r2c (real_t (0 )),
325
329
pinfo_x_r.shape [1 ] * pinfo_x_r.shape [2 ], &work_sz_r2c_x));
@@ -336,15 +340,15 @@ int main(int argc, char** argv) {
336
340
if (!no_slab_opt && config.pdims [0 ] == 1 && config.pdims [1 ] == 1 ) {
337
341
// single rank, x-y-z slab: use 3D FFT
338
342
slab_xyz = true ;
339
- CHECK_CUFFT_EXIT (cufftMakePlan3d (cufft_plan_c2c_x, gx, gy, gz, get_cufft_type_c2c (real_t (0 )),
340
- &work_sz_c2c_x));
343
+ CHECK_CUFFT_EXIT (cufftMakePlan3d (cufft_plan_c2c_x, gx, gy, gz, get_cufft_type_c2c (real_t (0 )), &work_sz_c2c_x));
341
344
} else if (!no_slab_opt && config.pdims [0 ] == 1 ) {
342
345
// x-y slab: use 2D FFT
343
346
slab_xy = true ;
344
347
std::array<int , 2 > n{gy, gx};
345
- CHECK_CUFFT_EXIT (cufftMakePlanMany (
346
- cufft_plan_c2c_x, 2 , n.data (), nullptr , 1 , pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], nullptr , 1 ,
347
- pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], get_cufft_type_c2c (real_t (0 )), pinfo_x_c.shape [2 ], &work_sz_c2c_x));
348
+ CHECK_CUFFT_EXIT (cufftMakePlanMany (cufft_plan_c2c_x, 2 , n.data (), nullptr , 1 ,
349
+ pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], nullptr , 1 ,
350
+ pinfo_x_c.shape [0 ] * pinfo_x_c.shape [1 ], get_cufft_type_c2c (real_t (0 )),
351
+ pinfo_x_c.shape [2 ], &work_sz_c2c_x));
348
352
} else {
349
353
CHECK_CUFFT_EXIT (cufftMakePlan1d (cufft_plan_c2c_x, gx, get_cufft_type_c2c (real_t (0 )),
350
354
pinfo_x_c.shape [1 ] * pinfo_x_c.shape [2 ], &work_sz_c2c_x));
@@ -363,9 +367,10 @@ int main(int argc, char** argv) {
363
367
slab_yz = true ;
364
368
if (axis_contiguous[1 ]) {
365
369
std::array<int , 2 > n{gz, gy};
366
- CHECK_CUFFT_EXIT (cufftMakePlanMany (
367
- cufft_plan_c2c_y, 2 , n.data (), nullptr , 1 , pinfo_y_c.shape [0 ] * pinfo_y_c.shape [1 ], nullptr , 1 ,
368
- pinfo_y_c.shape [0 ] * pinfo_y_c.shape [1 ], get_cufft_type_c2c (real_t (0 )), pinfo_y_c.shape [2 ], &work_sz_c2c_y));
370
+ CHECK_CUFFT_EXIT (cufftMakePlanMany (cufft_plan_c2c_y, 2 , n.data (), nullptr , 1 ,
371
+ pinfo_y_c.shape [0 ] * pinfo_y_c.shape [1 ], nullptr , 1 ,
372
+ pinfo_y_c.shape [0 ] * pinfo_y_c.shape [1 ], get_cufft_type_c2c (real_t (0 )),
373
+ pinfo_y_c.shape [2 ], &work_sz_c2c_y));
369
374
} else {
370
375
// Note: In this case, both slab dimensions are strided, leading to slower performance using
371
376
// 2D FFT. Run 1D + 1D instead.
@@ -508,8 +513,8 @@ int main(int argc, char** argv) {
508
513
509
514
if (!slab_xyz) {
510
515
CHECK_CUDECOMP_EXIT (cudecompTransposeXToY (handle, grid_desc_c, input, output, work_c_d,
511
- get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr ,
512
- nullptr , nullptr , 0 ));
516
+ get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr , nullptr , nullptr ,
517
+ 0 ));
513
518
}
514
519
515
520
if (!slab_xy && !slab_xyz) {
@@ -531,8 +536,8 @@ int main(int argc, char** argv) {
531
536
// For y-z slab case, no need to perform yz transposes or z-axis FFT
532
537
if (!slab_yz && !slab_xyz) {
533
538
CHECK_CUDECOMP_EXIT (cudecompTransposeYToZ (handle, grid_desc_c, input, output, work_c_d,
534
- get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr ,
535
- nullptr , nullptr , 0 ));
539
+ get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr , nullptr , nullptr ,
540
+ 0 ));
536
541
}
537
542
538
543
if (!slab_yz && !slab_xyz) {
@@ -547,8 +552,8 @@ int main(int argc, char** argv) {
547
552
548
553
if (!slab_yz && !slab_xyz) {
549
554
CHECK_CUDECOMP_EXIT (cudecompTransposeZToY (handle, grid_desc_c, input, output, work_c_d,
550
- get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr ,
551
- nullptr , nullptr , 0 ));
555
+ get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr , nullptr , nullptr ,
556
+ 0 ));
552
557
}
553
558
554
559
if (!slab_xy && !slab_xyz) {
@@ -569,8 +574,8 @@ int main(int argc, char** argv) {
569
574
570
575
if (!slab_xyz) {
571
576
CHECK_CUDECOMP_EXIT (cudecompTransposeYToX (handle, grid_desc_c, input, output, work_c_d,
572
- get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr ,
573
- nullptr , nullptr , 0 ));
577
+ get_cudecomp_datatype (complex_t (0 )), nullptr , nullptr , nullptr , nullptr ,
578
+ 0 ));
574
579
}
575
580
#ifdef R2C
576
581
CHECK_CUFFT_EXIT (cufftXtExec (cufft_plan_c2r_x, output, output_r, CUFFT_INVERSE));
@@ -650,7 +655,9 @@ int main(int argc, char** argv) {
650
655
std::sort (trial_times.begin (), trial_times.end ());
651
656
double flopcount = 5.0 * fftsize * std::log (static_cast <double >(fftsize)) * 1e-9 / std::log (2.0 );
652
657
std::vector<double > trial_flops (ntrials);
653
- for (int i = 0 ; i < ntrials; ++i) { trial_flops[i] = flopcount / trial_times[i]; }
658
+ for (int i = 0 ; i < ntrials; ++i) {
659
+ trial_flops[i] = flopcount / trial_times[i];
660
+ }
654
661
655
662
auto times = process_timings (trial_times, 1000 .);
656
663
auto flops = process_timings (trial_flops);
0 commit comments