From b90eca05a41e7a6f6b9633773f7e314408137c24 Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Tue, 10 Oct 2017 12:06:47 -0400 Subject: [PATCH 1/4] add blocksize to params --- Makefile | 2 + common-diffusion/output.c | 69 ++++++++++-------- common-diffusion/output.h | 2 +- cpu-analytic-diffusion/analytic_main.c | 4 +- cpu-openmp-diffusion/openmp_main.c | 4 +- cpu-serial-diffusion/serial_main.c | 4 +- cpu-tbb-diffusion/tbb_main.c | 4 +- gpu-cuda-diffusion/cuda_data.h | 5 +- gpu-cuda-diffusion/cuda_discretization.cu | 85 ++++++++++++----------- gpu-cuda-diffusion/cuda_kernels.cuh | 10 --- gpu-cuda-diffusion/cuda_main.c | 6 +- gpu-openacc-diffusion/openacc_main.c | 4 +- gpu-opencl-diffusion/opencl_main.c | 4 +- phi-openmp-diffusion/phi_main.c | 4 +- 14 files changed, 110 insertions(+), 97 deletions(-) diff --git a/Makefile b/Makefile index 466f148..b6abd02 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,8 @@ gpu_diffusion_list := gpu-cuda-diffusion \ gpu-openacc-diffusion \ gpu-opencl-diffusion +phi_diffusion_list := phi-openmp-diffusion + .PHONY: run run: run_cpu_diffusion run_gpu_diffusion diff --git a/common-diffusion/output.c b/common-diffusion/output.c index b14cdf5..3f4204f 100644 --- a/common-diffusion/output.c +++ b/common-diffusion/output.c @@ -29,12 +29,13 @@ #include #include "output.h" -void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks) +/* void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks) */ +void param_parser(int argc, char* argv[], int* bx, int* by, int* checks, int* code, fp_t* D, fp_t* dx, fp_t* dy, fp_t* linStab, int* nm, int* nx, int* ny, int* steps) { FILE * input; char buffer[256]; char* pch; - int inx=0, iny=0, idx=0, idy=0, ins=0, inc=0, idc=0, ico=0, isc=0; + int ibx=0, iby=0, ico=0, idc=0, idx=0, idy=0, inc=0, ins=0, inx=0, iny=0, isc=0; if (argc != 2) { printf("Error: improper arguments supplied.\nUsage: ./%s filename\n", argv[0]); @@ -53,14 +54,22 @@ void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, { pch = strtok(buffer, " "); - if (strcmp(pch, "nx") == 0) { + if (strcmp(pch, "bx") == 0) { pch = strtok(NULL, " "); - *nx = atoi(pch); - inx = 1; - } else if (strcmp(pch, "ny") == 0) { + *bx = atoi(pch); + ibx = 1; + } else if (strcmp(pch, "by") == 0) { pch = strtok(NULL, " "); - *ny = atoi(pch); - iny = 1; + *by = atoi(pch); + iby = 1; + } else if (strcmp(pch, "co") == 0) { + pch = strtok(NULL, " "); + *linStab = atof(pch); + ico = 1; + } else if (strcmp(pch, "dc") == 0) { + pch = strtok(NULL, " "); + *D = atof(pch); + idc = 1; } else if (strcmp(pch, "dx") == 0) { pch = strtok(NULL, " "); *dx = atof(pch); @@ -69,22 +78,22 @@ void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, pch = strtok(NULL, " "); *dy = atof(pch); idy = 1; - } else if (strcmp(pch, "ns") == 0) { - pch = strtok(NULL, " "); - *steps = atoi(pch); - ins = 1; } else if (strcmp(pch, "nc") == 0) { pch = strtok(NULL, " "); *checks = atoi(pch); inc = 1; - } else if (strcmp(pch, "dc") == 0) { + } else if (strcmp(pch, "ns") == 0) { pch = strtok(NULL, " "); - *D = atof(pch); - idc = 1; - } else if (strcmp(pch, "co") == 0) { + *steps = atoi(pch); + ins = 1; + } else if (strcmp(pch, "nx") == 0) { pch = strtok(NULL, " "); - *linStab = atof(pch); - ico = 1; + *nx = atoi(pch); + inx = 1; + } else if (strcmp(pch, "ny") == 0) { + pch = strtok(NULL, " "); + *ny = 
atoi(pch); + iny = 1; } else if (strcmp(pch, "sc") == 0) { pch = strtok(NULL, " "); *nm = atoi(pch); @@ -98,22 +107,26 @@ void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, } /* make sure we got everyone */ - if (! inx) { - printf("Warning: parameter %s undefined. Using default value, %i.\n", "nx", *nx); - } else if (! iny) { - printf("Warning: parameter %s undefined. Using default value, %i.\n", "ny", *ny); + if (! ibx) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "bx", *bx); + } else if (! iby) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "by", *by); + } else if (! ico) { + printf("Warning: parameter %s undefined. Using default value, %f.\n", "co", *linStab); + } else if (! idc) { + printf("Warning: parameter %s undefined. Using default value, %f.\n", "dc", *D); } else if (! idx) { printf("Warning: parameter %s undefined. Using default value, %f.\n", "dx", *dx); } else if (! idy) { printf("Warning: parameter %s undefined. Using default value, %f.\n", "dy", *dy); - } else if (! ins) { - printf("Warning: parameter %s undefined. Using default value, %i.\n", "ns", *steps); } else if (! inc) { printf("Warning: parameter %s undefined. Using default value, %i.\n", "nc", *checks); - } else if (! idc) { - printf("Warning: parameter %s undefined. Using default value, %f.\n", "dc", *D); - } else if (! ico) { - printf("Warning: parameter %s undefined. Using default value, %f.\n", "co", *linStab); + } else if (! ins) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "ns", *steps); + } else if (! inx) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "nx", *nx); + } else if (! iny) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "ny", *ny); } else if (! isc) { printf("Warning: parameter %s undefined. Using default values, %i and %i.\n", "sc", *nm, *code); } diff --git a/common-diffusion/output.h b/common-diffusion/output.h index 38c171a..9643390 100644 --- a/common-diffusion/output.h +++ b/common-diffusion/output.h @@ -32,7 +32,7 @@ /** \brief Read parameters from file specified on the command line */ -void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks); +void param_parser(int argc, char* argv[], int* bx, int* by, int* checks, int* code, fp_t* D, fp_t* dx, fp_t* dy, fp_t* linStab, int* nm, int* nx, int* ny, int* steps); /** \brief Prints timestamps and a 20-point progress bar to stdout diff --git a/cpu-analytic-diffusion/analytic_main.c b/cpu-analytic-diffusion/analytic_main.c index 3f4e8a9..eb8969e 100644 --- a/cpu-analytic-diffusion/analytic_main.c +++ b/cpu-analytic-diffusion/analytic_main.c @@ -72,7 +72,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; /* declare default materials and numerical parameters */ @@ -82,7 +82,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? 
dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/cpu-openmp-diffusion/openmp_main.c b/cpu-openmp-diffusion/openmp_main.c index d3a284a..4b4a5d1 100644 --- a/cpu-openmp-diffusion/openmp_main.c +++ b/cpu-openmp-diffusion/openmp_main.c @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -55,7 +55,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/cpu-serial-diffusion/serial_main.c b/cpu-serial-diffusion/serial_main.c index c0a705f..da8a752 100644 --- a/cpu-serial-diffusion/serial_main.c +++ b/cpu-serial-diffusion/serial_main.c @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -63,7 +63,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/cpu-tbb-diffusion/tbb_main.c b/cpu-tbb-diffusion/tbb_main.c index 33539c0..10bc7b3 100644 --- a/cpu-tbb-diffusion/tbb_main.c +++ b/cpu-tbb-diffusion/tbb_main.c @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -55,7 +55,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? 
dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/gpu-cuda-diffusion/cuda_data.h b/gpu-cuda-diffusion/cuda_data.h index 4f2159f..475c098 100644 --- a/gpu-cuda-diffusion/cuda_data.h +++ b/gpu-cuda-diffusion/cuda_data.h @@ -48,8 +48,9 @@ void init_cuda(fp_t** conc_old, fp_t** mask_lap, fp_t bc[2][2], \brief Specialization of solve_diffusion_equation() using CUDA */ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, - int nx, int ny, int nm, - fp_t bc[2][2], fp_t D, fp_t dt, int checks, + fp_t bc[2][2], int bx, int by, + int nm, int nx, int ny, + fp_t D, fp_t dt, int checks, fp_t *elapsed, struct Stopwatch* sw); /** diff --git a/gpu-cuda-diffusion/cuda_discretization.cu b/gpu-cuda-diffusion/cuda_discretization.cu index 36651ca..c04dd94 100644 --- a/gpu-cuda-diffusion/cuda_discretization.cu +++ b/gpu-cuda-diffusion/cuda_discretization.cu @@ -46,53 +46,56 @@ __global__ void convolution_kernel(fp_t* d_conc_old, const int ny, const int nm) { - int i, j, tx, ty; - int dst_row, dst_col, dst_cols, dst_rows; - int src_row, src_col, src_cols, src_rows; + int i, j, y; + int dst_til_y, dst_til_x, dst_til_nx, dst_til_ny; + int src_til_y, src_til_x, src_til_nx, src_til_ny; + int sha_til_x, sha_til_y, sha_til_nx; fp_t value=0.; - /* source tile width includes the halo cells */ - src_cols = blockDim.x; - src_rows = blockDim.y; + /* source and shared tile width include the halo cells */ + src_til_nx = blockDim.x; + src_til_ny = blockDim.y; + sha_til_nx = src_til_nx; /* destination tile width excludes the halo cells */ - dst_cols = src_cols - nm + 1; - dst_rows = src_rows - nm + 1; + dst_til_nx = src_til_nx - nm + 1; + dst_til_ny = src_til_ny - nm + 1; /* determine indices on which to operate */ - tx = threadIdx.x; - ty = threadIdx.y; + sha_til_x = threadIdx.x; + sha_til_y = threadIdx.y; - dst_col = blockIdx.x * dst_cols + tx; - dst_row = blockIdx.y * dst_rows + ty; + dst_til_x = blockIdx.x * dst_til_nx + sha_til_x; + dst_til_y = blockIdx.y * dst_til_ny + sha_til_y; - src_col = dst_col - nm/2; - src_row = dst_row - nm/2; + src_til_x = dst_til_x - nm/2; + src_til_y = dst_til_y - nm/2; /* copy tile: __shared__ gives access to all threads working on this tile */ - __shared__ fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1]; + extern __shared__ fp_t d_conc_tile[]; - if (src_row >= 0 && src_row < ny && - src_col >= 0 && src_col < nx) { - /* if src_row==0, then dst_row==nm/2: this is a halo row */ - d_conc_tile[ty][tx] = d_conc_old[src_row * nx + src_col]; + if (src_til_y >= 0 && src_til_y < ny && + src_til_x >= 0 && src_til_x < nx) { + /* if src_til_y==0, then dst_til_y==nm/2: this is a halo row */ + d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = d_conc_old[src_til_y * nx + src_til_x]; } else { - d_conc_tile[ty][tx] = 0.; + d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = 0.; } /* tile data is shared: wait for all threads to finish copying */ __syncthreads(); /* compute the convolution */ - if (tx < dst_cols && ty < dst_rows) { + if (sha_til_x < dst_til_nx && sha_til_y < dst_til_ny) { for (j = 0; j < nm; j++) { + y = (j+sha_til_y) * sha_til_nx; for (i = 0; i < nm; i++) { - value += d_mask[j * nm + i] * d_conc_tile[j+ty][i+tx]; + value += d_mask[j * nm + i] * d_conc_tile[y + i + sha_til_x]; } } /* record value */ - if (dst_row < ny && dst_col < nx) { - d_conc_lap[dst_row * nx + dst_col] = value; + if (dst_til_y < ny && dst_til_x < nx) { + d_conc_lap[dst_til_y * nx + dst_til_x] = value; } } @@ -109,19 +112,19 @@ __global__ void diffusion_kernel(fp_t* d_conc_old, const 
fp_t D, const fp_t dt) { - int tx, ty, row, col; + int thr_x, thr_y, x, y; /* determine indices on which to operate */ - tx = threadIdx.x; - ty = threadIdx.y; + thr_x = threadIdx.x; + thr_y = threadIdx.y; - col = blockDim.x * blockIdx.x + tx; - row = blockDim.y * blockIdx.y + ty; + x = blockDim.x * blockIdx.x + thr_x; + y = blockDim.y * blockIdx.y + thr_y; /* explicit Euler solution to the equation of motion */ - if (row < ny && col < nx) { - d_conc_new[row * nx + col] = d_conc_old[row * nx + col] - + dt * D * d_conc_lap[row * nx + col]; + if (x < nx && y < ny) { + d_conc_new[y * nx + x] = d_conc_old[y * nx + x] + + dt * D * d_conc_lap[y * nx + x]; } /* wait for all threads to finish writing */ @@ -129,7 +132,8 @@ __global__ void diffusion_kernel(fp_t* d_conc_old, } void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, - int nx, int ny, int nm, fp_t bc[2][2], + fp_t bc[2][2], int bx, int by, + int nm, int nx, int ny, fp_t D, fp_t dt, int checks, fp_t* elapsed, struct Stopwatch* sw) { @@ -138,9 +142,9 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, /* divide matrices into blocks of (TILE_W x TILE_H) threads */ - /* extremely precarious */ - dim3 tile_dim(TILE_W - nm + 1, TILE_H - nm + 1, 1); + dim3 tile_dim(bx - nm + 1, by - nm + 1, 1); dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1); + size_t tile_size = bx * by; for (check = 0; check < checks; check++) { /* apply boundary conditions */ @@ -150,7 +154,7 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, /* compute Laplacian */ start_time = GetTimer(); - convolution_kernel<<>> ( + convolution_kernel<<>> ( dev->conc_old, dev->conc_lap, nx, ny, nm ); sw->conv += GetTimer() - start_time; @@ -224,7 +228,9 @@ void check_solution(fp_t** conc_new, fp_t** conc_lap, int nx, int ny, } void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, - int nx, int ny, int nm) + int bx, int by, + int nm, + int nx, int ny) { /* If you must compute the convolution separately, do so here. 
* It is strongly recommended that you roll CUDA tasks into one function: @@ -240,8 +246,9 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, /* divide matrices into blocks of (TILE_W x TILE_W) threads */ /* dim3 tile_dim(TILE_W - nm/2, TILE_H - nm/2, 1); */ /* dim3 bloc_dim(ceil(fp_t(nx)/tile_dim.x)+1, ceil(fp_t(ny)/tile_dim.y)+1, 1); */ - dim3 tile_dim(TILE_W - nm +1, TILE_H - nm + 1, 1); + dim3 tile_dim(bx - nm +1, by - nm + 1, 1); dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1); + size_t tile_size = bx * by; /* transfer mask in to constant device memory */ cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t)); @@ -251,7 +258,7 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, cudaMemcpyHostToDevice); /* compute Laplacian */ - convolution_kernel<<>> ( + convolution_kernel<<>> ( d_conc_old, d_conc_lap, nx, ny, nm ); diff --git a/gpu-cuda-diffusion/cuda_kernels.cuh b/gpu-cuda-diffusion/cuda_kernels.cuh index a1539f1..7f07e55 100644 --- a/gpu-cuda-diffusion/cuda_kernels.cuh +++ b/gpu-cuda-diffusion/cuda_kernels.cuh @@ -31,16 +31,6 @@ extern "C" { #include "numerics.h" } -/** - \brief Width of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_W 32 - -/** - \brief Height of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_H 32 - /** \brief Convolution mask array on the GPU, allocated in protected memory */ diff --git a/gpu-cuda-diffusion/cuda_main.c b/gpu-cuda-diffusion/cuda_main.c index 77304ef..a833eee 100644 --- a/gpu-cuda-diffusion/cuda_main.c +++ b/gpu-cuda-diffusion/cuda_main.c @@ -48,7 +48,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -60,7 +60,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); @@ -109,7 +109,7 @@ int main(int argc, char* argv[]) assert(step + checks <= steps); - cuda_diffusion_solver(&dev, conc_new, nx, ny, nm, bc, D, dt, checks, &elapsed, &sw); + cuda_diffusion_solver(&dev, conc_new, bc, bx, by, nm, nx, ny, D, dt, checks, &elapsed, &sw); for (i = 0; i < checks; i++) { step++; diff --git a/gpu-openacc-diffusion/openacc_main.c b/gpu-openacc-diffusion/openacc_main.c index 8de3269..7122980 100644 --- a/gpu-openacc-diffusion/openacc_main.c +++ b/gpu-openacc-diffusion/openacc_main.c @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -63,7 +63,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? 
dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/gpu-opencl-diffusion/opencl_main.c b/gpu-opencl-diffusion/opencl_main.c index 06131af..245fa8f 100644 --- a/gpu-opencl-diffusion/opencl_main.c +++ b/gpu-opencl-diffusion/opencl_main.c @@ -49,7 +49,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -61,7 +61,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/phi-openmp-diffusion/phi_main.c b/phi-openmp-diffusion/phi_main.c index 1835ef8..fd99a42 100644 --- a/phi-openmp-diffusion/phi_main.c +++ b/phi-openmp-diffusion/phi_main.c @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -55,7 +55,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); From 70437d2c202acc2441c0e9d5fe29b2e8cff807e0 Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Mon, 16 Oct 2017 17:58:44 -0400 Subject: [PATCH 2/4] Fix root cause of tiled algorithm bugs (block size spec) Closes #96, closes #118. 
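
The gist of the fix, sketched from the hunks below (the diffs themselves are authoritative; the snippet assumes the enclosing cuda_diffusion_solver() scope, i.e. dev, nx, ny, and nm): rather than shrinking each thread block by the stencil halo and padding the grid with extra blocks, the kernels now launch full TILE_W x TILE_H blocks, and the grid is sized so that neighboring blocks overlap by the halo width.

    /* old launch configuration ("extremely precarious") */
    dim3 tile_dim(TILE_W - nm + 1, TILE_H - nm + 1, 1);
    dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x),
                  nm - 1 + floor(float(ny)/tile_dim.y), 1);

    /* new launch configuration: full-sized blocks; the grid advances by the
       interior tile width (block minus halo) so every point is covered */
    dim3 tile_size(TILE_W, TILE_H, 1);
    dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)),
                   ceil(float(ny) / (tile_size.y - nm + 1)), 1);
    convolution_kernel<<<num_tiles, tile_size>>> (dev->conc_old, dev->conc_lap, nx, ny, nm);

The same launch configuration is reused in compute_convolution() and mirrored in the OpenCL work-group setup.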
---
 common-diffusion/timer.c | 3 ++
 gpu-cuda-diffusion/cuda_discretization.cu | 36 ++++++++++++--------
 gpu-opencl-diffusion/opencl_discretization.c | 18 +++++-----
 3 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/common-diffusion/timer.c b/common-diffusion/timer.c
index 978af5d..1253307 100644
--- a/common-diffusion/timer.c
+++ b/common-diffusion/timer.c
@@ -36,6 +36,9 @@
 #ifndef __USE_BSD
 #define __USE_BSD
 #endif
+ #ifndef __USE_MISC
+ #define __USE_MISC
+ #endif
 #include 
 #endif

diff --git a/gpu-cuda-diffusion/cuda_discretization.cu b/gpu-cuda-diffusion/cuda_discretization.cu
index 36651ca..8221575 100644
--- a/gpu-cuda-diffusion/cuda_discretization.cu
+++ b/gpu-cuda-diffusion/cuda_discretization.cu
@@ -136,37 +136,40 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new,
 	double start_time;
 	int check=0;
 
-	/* divide matrices into blocks of (TILE_W x TILE_H) threads */
-
-	/* extremely precarious */
-	dim3 tile_dim(TILE_W - nm + 1, TILE_H - nm + 1, 1);
-	dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1);
+	/* divide matrices into blocks of TILE_W * TILE_H threads */
+	dim3 tile_size(TILE_W,
+	               TILE_H,
+	               1);
+	dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)),
+	               ceil(float(ny) / (tile_size.y - nm + 1)),
+	               1);
 
 	for (check = 0; check < checks; check++) {
 		/* apply boundary conditions */
-		boundary_kernel<<<bloc_dim, tile_dim>>> (
+		boundary_kernel<<<num_tiles, tile_size>>> (
 			dev->conc_old, nx, ny, nm
 		);
 
 		/* compute Laplacian */
 		start_time = GetTimer();
-		convolution_kernel<<<bloc_dim, tile_dim>>> (
+		convolution_kernel<<<num_tiles, tile_size>>> (
 			dev->conc_old, dev->conc_lap, nx, ny, nm
 		);
 		sw->conv += GetTimer() - start_time;
 
 		/* compute result */
 		start_time = GetTimer();
-		diffusion_kernel<<<bloc_dim, tile_dim>>> (
+		diffusion_kernel<<<num_tiles, tile_size>>> (
 			dev->conc_old, dev->conc_new, dev->conc_lap, nx, ny, nm, D, dt
 		);
 		sw->step += GetTimer() - start_time;
 
 		swap_pointers_1D(&(dev->conc_old), &(dev->conc_new));
+
+		*elapsed += dt;
 	}
 	/* after swap, new data is in dev->conc_old */
-	*elapsed += dt * checks;
 
 	/* transfer from device out to host (conc_new) */
 	start_time = GetTimer();
@@ -232,16 +235,19 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap,
 	 */
 
 	fp_t* d_conc_old, *d_conc_lap;
+	const int margin = 0; /* nm - 1; */
 
 	/* allocate memory on device */
 	cudaMalloc((void **) &d_conc_old, nx * ny * sizeof(fp_t));
 	cudaMalloc((void **) &d_conc_lap, nx * ny * sizeof(fp_t));
 
-	/* divide matrices into blocks of (TILE_W x TILE_W) threads */
-	/* dim3 tile_dim(TILE_W - nm/2, TILE_H - nm/2, 1); */
-	/* dim3 bloc_dim(ceil(fp_t(nx)/tile_dim.x)+1, ceil(fp_t(ny)/tile_dim.y)+1, 1); */
-	dim3 tile_dim(TILE_W - nm +1, TILE_H - nm + 1, 1);
-	dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1);
+	/* divide matrices into blocks of TILE_W * TILE_H threads */
+	dim3 tile_size(TILE_W,
+	               TILE_H,
+	               1);
+	dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)),
+	               ceil(float(ny) / (tile_size.y - nm + 1)),
+	               1);
 
 	/* transfer mask in to constant device memory */
 	cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t));
@@ -251,7 +257,7 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap,
 
 	/* compute Laplacian */
-	convolution_kernel<<<bloc_dim, tile_dim>>> (
+	convolution_kernel<<<num_tiles, tile_size>>> (
 		d_conc_old, d_conc_lap, nx, ny, nm
 	);
 
diff --git a/gpu-opencl-diffusion/opencl_discretization.c b/gpu-opencl-diffusion/opencl_discretization.c
index 6a40a3b..d9cdf65 100644
--- a/gpu-opencl-diffusion/opencl_discretization.c
+++ b/gpu-opencl-diffusion/opencl_discretization.c
@@ -46,15 
+46,12 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, power of two: 4, 8, 16, 32, etc. OpenCL will make a best-guess optimal block size if you set size_t* tile_dim = NULL. */ - /* - size_t grid_dim[2] = {(size_t)nx, (size_t)ny}; - size_t tile_dim[2] = {TILE_W, TILE_H}; - */ - - /* extremely precarious */ - size_t tile_dim[2] = {TILE_W - nm + 1, TILE_H - nm + 1}; - size_t bloc_dim[2] = {nm - 1 + floor((float)(nx)/tile_dim[0]), nm - 1 + floor((float)(ny)/tile_dim[1])}; - size_t grid_dim[2] = {bloc_dim[0]*tile_dim[0], bloc_dim[1]*tile_dim[1]}; + size_t tile_dim[2] = {TILE_W, + TILE_H}; + size_t bloc_dim[2] = {ceil((float)(nx) / (tile_dim[0] - nm + 1)), + ceil((float)(ny) / (tile_dim[1] - nm + 1))}; + size_t grid_dim[2] = {bloc_dim[0] * tile_dim[0], + bloc_dim[1] * tile_dim[1]}; cl_mem d_conc_old = dev->conc_old; cl_mem d_conc_new = dev->conc_new; @@ -112,9 +109,10 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); report_error(status, "enqueue kernels"); + + *elapsed += dt; } - *elapsed += dt * checks; /* transfer from device out to host */ start_time = GetTimer(); From b34632b5d560160f964a114b3781abc42f3307b8 Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Tue, 17 Oct 2017 14:39:49 -0400 Subject: [PATCH 3/4] physical variable names, dynamic tiles in GPU convolution kernels Closes #113, closes #114 --- common-diffusion/output.c | 1 - gpu-cuda-diffusion/cuda_discretization.cu | 179 ++++++++++--------- gpu-opencl-diffusion/kernel_boundary.cl | 32 ++-- gpu-opencl-diffusion/kernel_convolution.cl | 51 +++--- gpu-opencl-diffusion/kernel_diffusion.cl | 2 +- gpu-opencl-diffusion/opencl_boundaries.c | 4 +- gpu-opencl-diffusion/opencl_data.c | 3 +- gpu-opencl-diffusion/opencl_data.h | 2 +- gpu-opencl-diffusion/opencl_discretization.c | 34 ++-- gpu-opencl-diffusion/opencl_kernels.h | 44 ----- gpu-opencl-diffusion/opencl_main.c | 2 +- 11 files changed, 164 insertions(+), 190 deletions(-) delete mode 100644 gpu-opencl-diffusion/opencl_kernels.h diff --git a/common-diffusion/output.c b/common-diffusion/output.c index 3f4204f..f305f74 100644 --- a/common-diffusion/output.c +++ b/common-diffusion/output.c @@ -29,7 +29,6 @@ #include #include "output.h" -/* void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks) */ void param_parser(int argc, char* argv[], int* bx, int* by, int* checks, int* code, fp_t* D, fp_t* dx, fp_t* dy, fp_t* linStab, int* nm, int* nx, int* ny, int* steps) { FILE * input; diff --git a/gpu-cuda-diffusion/cuda_discretization.cu b/gpu-cuda-diffusion/cuda_discretization.cu index b5bf596..3a68476 100644 --- a/gpu-cuda-diffusion/cuda_discretization.cu +++ b/gpu-cuda-diffusion/cuda_discretization.cu @@ -46,56 +46,53 @@ __global__ void convolution_kernel(fp_t* d_conc_old, const int ny, const int nm) { - int i, j, y; - int dst_til_y, dst_til_x, dst_til_nx, dst_til_ny; - int src_til_y, src_til_x, src_til_nx, src_til_ny; - int sha_til_x, sha_til_y, sha_til_nx; + int i, j; + int dst_x, dst_y, dst_nx, dst_ny; + int src_x, src_y, src_nx, src_ny; + int til_x, til_y, til_nx; fp_t value=0.; - /* source and shared tile width include the halo cells */ - src_til_nx = blockDim.x; - src_til_ny = blockDim.y; - sha_til_nx = 
src_til_nx; + /* source and tile width include the halo cells */ + src_nx = blockDim.x; + src_ny = blockDim.y; + til_nx = src_nx; - /* destination tile width excludes the halo cells */ - dst_til_nx = src_til_nx - nm + 1; - dst_til_ny = src_til_ny - nm + 1; + /* destination width excludes the halo cells */ + dst_nx = src_nx - nm + 1; + dst_ny = src_ny - nm + 1; - /* determine indices on which to operate */ - sha_til_x = threadIdx.x; - sha_til_y = threadIdx.y; + /* determine tile indices on which to operate */ + til_x = threadIdx.x; + til_y = threadIdx.y; - dst_til_x = blockIdx.x * dst_til_nx + sha_til_x; - dst_til_y = blockIdx.y * dst_til_ny + sha_til_y; + dst_x = blockIdx.x * dst_nx + til_x; + dst_y = blockIdx.y * dst_ny + til_y; - src_til_x = dst_til_x - nm/2; - src_til_y = dst_til_y - nm/2; + src_x = dst_x - nm/2; + src_y = dst_y - nm/2; /* copy tile: __shared__ gives access to all threads working on this tile */ extern __shared__ fp_t d_conc_tile[]; - if (src_til_y >= 0 && src_til_y < ny && - src_til_x >= 0 && src_til_x < nx) { - /* if src_til_y==0, then dst_til_y==nm/2: this is a halo row */ - d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = d_conc_old[src_til_y * nx + src_til_x]; - } else { - d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = 0.; + if (src_x >= 0 && src_x < nx && + src_y >= 0 && src_y < ny ) { + /* if src_y==0, then dst_y==nm/2: this is a halo row */ + d_conc_tile[til_nx * til_y + til_x] = d_conc_old[nx * src_y + src_x]; } /* tile data is shared: wait for all threads to finish copying */ __syncthreads(); /* compute the convolution */ - if (sha_til_x < dst_til_nx && sha_til_y < dst_til_ny) { + if (til_x < dst_nx && til_y < dst_ny) { for (j = 0; j < nm; j++) { - y = (j+sha_til_y) * sha_til_nx; for (i = 0; i < nm; i++) { - value += d_mask[j * nm + i] * d_conc_tile[y + i + sha_til_x]; + value += d_mask[j * nm + i] * d_conc_tile[til_nx * (til_y+j) + til_x+i]; } } /* record value */ - if (dst_til_y < ny && dst_til_x < nx) { - d_conc_lap[dst_til_y * nx + dst_til_x] = value; + if (dst_y < ny && dst_x < nx) { + d_conc_lap[nx * dst_y + dst_x] = value; } } @@ -103,6 +100,56 @@ __global__ void convolution_kernel(fp_t* d_conc_old, __syncthreads(); } +/** + \brief Reference showing how to invoke the convolution kernel. + + A stand-alone function like this incurs the cost of host-to-device data + transfer each time it is called: it is a teaching tool, not reusable code. + It is the basis for cuda_diffusion_solver(), which achieves much better + performance by bundling CUDA kernels together and intelligently managing + data transfers between the host (CPU) and device (GPU). 
+*/ +void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, + int bx, int by, + int nm, + int nx, int ny) +{ + fp_t* d_conc_old, *d_conc_lap; + + /* allocate memory on device */ + cudaMalloc((void **) &d_conc_old, nx * ny * sizeof(fp_t)); + cudaMalloc((void **) &d_conc_lap, nx * ny * sizeof(fp_t)); + + /* divide matrices into blocks of TILE_W * TILE_H threads */ + dim3 tile_size(bx, + by, + 1); + dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)), + ceil(float(ny) / (tile_size.y - nm + 1)), + 1); + size_t buf_size = (tile_size.x + nm) * (tile_size.x + nm) * sizeof(fp_t); + + /* transfer mask in to constant device memory */ + cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t)); + + /* transfer data from host in to device */ + cudaMemcpy(d_conc_old, conc_old[0], nx * ny * sizeof(fp_t), + cudaMemcpyHostToDevice); + + /* compute Laplacian */ + convolution_kernel<<>> ( + d_conc_old, d_conc_lap, nx, ny, nm + ); + + /* transfer from device out to host */ + cudaMemcpy(conc_lap[0], d_conc_lap, nx * ny * sizeof(fp_t), + cudaMemcpyDeviceToHost); + + /* free memory on device */ + cudaFree(d_conc_old); + cudaFree(d_conc_lap); +} + __global__ void diffusion_kernel(fp_t* d_conc_old, fp_t* d_conc_new, fp_t* d_conc_lap, @@ -123,14 +170,24 @@ __global__ void diffusion_kernel(fp_t* d_conc_old, /* explicit Euler solution to the equation of motion */ if (x < nx && y < ny) { - d_conc_new[y * nx + x] = d_conc_old[y * nx + x] - + dt * D * d_conc_lap[y * nx + x]; + d_conc_new[nx * y + x] = d_conc_old[nx * y + x] + + dt * D * d_conc_lap[nx * y + x]; } /* wait for all threads to finish writing */ __syncthreads(); } +/** + \brief Optimized code for solving the diffusion equation. + + Compare cuda_diffusion_solver(): it accomplishes the same result, but without + the memory allocation, data transfer, and array release. These are handled in + cuda_init(), with arrays on the host and device managed through CudaData, + which is a struct passed by reference into the function. In this way, + device kernels can be called in isolation without incurring the cost of data + transfers and with reduced risk of memory leaks. 
+*/ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, fp_t bc[2][2], int bx, int by, int nm, int nx, int ny, @@ -140,13 +197,14 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, double start_time; int check=0; - /* divide matrices into blocks of TILE_W *TILE_H threads */ - dim3 tile_size(TILE_W, - TILE_H, + /* divide matrices into blocks of bx * by threads */ + dim3 tile_size(bx, + by, 1); dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)), ceil(float(ny) / (tile_size.y - nm + 1)), 1); + size_t buf_size = (tile_size.x + nm) * (tile_size.y + nm) * sizeof(fp_t); for (check = 0; check < checks; check++) { /* apply boundary conditions */ @@ -156,7 +214,7 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, /* compute Laplacian */ start_time = GetTimer(); - convolution_kernel<<>> ( + convolution_kernel<<>> ( dev->conc_old, dev->conc_lap, nx, ny, nm ); sw->conv += GetTimer() - start_time; @@ -169,12 +227,11 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, sw->step += GetTimer() - start_time; swap_pointers_1D(&(dev->conc_old), &(dev->conc_new)); - - *elapsed += dt; } - /* after swap, new data is in dev->conc_old */ - /* transfer from device out to host (conc_new) */ + *elapsed += dt * checks; + + /* transfer result to host (conc_new) from device (dev->conc_old) */ start_time = GetTimer(); cudaMemcpy(conc_new[0], dev->conc_old, nx * ny * sizeof(fp_t), cudaMemcpyDeviceToHost); @@ -229,47 +286,3 @@ void check_solution(fp_t** conc_new, fp_t** conc_lap, int nx, int ny, *rss = sum; } -void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, - int bx, int by, - int nm, - int nx, int ny) -{ - /* If you must compute the convolution separately, do so here. - * It is strongly recommended that you roll CUDA tasks into one function: - * This legacy function is included to show basic usage of the kernel. 
- */ - - fp_t* d_conc_old, *d_conc_lap; - - /* allocate memory on device */ - cudaMalloc((void **) &d_conc_old, nx * ny * sizeof(fp_t)); - cudaMalloc((void **) &d_conc_lap, nx * ny * sizeof(fp_t)); - - /* divide matrices into blocks of TILE_W * TILE_H threads */ - dim3 tile_size(TILE_W, - TILE_H, - 1); - dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)), - ceil(float(ny) / (tile_size.y - nm + 1)), - 1); - - /* transfer mask in to constant device memory */ - cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t)); - - /* transfer data from host in to device */ - cudaMemcpy(d_conc_old, conc_old[0], nx * ny * sizeof(fp_t), - cudaMemcpyHostToDevice); - - /* compute Laplacian */ - convolution_kernel<<>> ( - d_conc_old, d_conc_lap, nx, ny, nm - ); - - /* transfer from device out to host */ - cudaMemcpy(conc_lap[0], d_conc_lap, nx * ny * sizeof(fp_t), - cudaMemcpyDeviceToHost); - - /* free memory on device */ - cudaFree(d_conc_old); - cudaFree(d_conc_lap); -} diff --git a/gpu-opencl-diffusion/kernel_boundary.cl b/gpu-opencl-diffusion/kernel_boundary.cl index 9842f79..c7e65a5 100644 --- a/gpu-opencl-diffusion/kernel_boundary.cl +++ b/gpu-opencl-diffusion/kernel_boundary.cl @@ -22,7 +22,7 @@ */ #pragma OPENCL EXTENSION cl_khr_fp64: enable -#include "opencl_kernels.h" +#include "numerics.h" /** \brief Boundary condition kernel for execution on the GPU @@ -37,22 +37,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc, int ny, int nm) { - int col, row; + int x, y; int ihi, ilo, jhi, jlo, offset; /* determine indices on which to operate */ - col = get_global_id(0); - row = get_global_id(1); + x = get_global_id(0); + y = get_global_id(1); /* apply fixed boundary values: sequence does not matter */ - if (row < ny/2 && col < 1+nm/2) { - d_conc[row * nx + col] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */ + if (x < 1+nm/2 && y < ny/2) { + d_conc[nx * y + x] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */ } - if (row >= ny/2 && row < ny && col >= nx-1-nm/2 && col < nx) { - d_conc[row * nx + col] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */ + if (x >= nx-1-nm/2 && x < nx && y >= ny/2 && y < ny) { + d_conc[nx * y + x] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */ } /* wait for all threads to finish writing */ @@ -66,17 +66,17 @@ __kernel void boundary_kernel(__global fp_t* d_conc, jlo = nm/2 - offset; jhi = ny - 1 - nm/2 + offset; - if (ilo-1 == col && row < ny) { - d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */ + if (ilo-1 == x && y < ny) { + d_conc[nx * y + ilo-1] = d_conc[nx * y + ilo]; /* left condition */ } - if (ihi+1 == col && row < ny) { - d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */ + if (ihi+1 == x && y < ny) { + d_conc[nx * y + ihi+1] = d_conc[nx * y + ihi]; /* right condition */ } - if (jlo-1 == row && col < nx) { - d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */ + if (jlo-1 == y && x < nx) { + d_conc[nx * (jlo-1) + x] = d_conc[nx * jlo + x]; /* bottom condition */ } - if (jhi+1 == row && col < nx) { - d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */ + if (jhi+1 == y && x < nx) { + d_conc[nx * (jhi+1) + x] = d_conc[nx * jhi + x]; /* top condition */ } barrier(CLK_GLOBAL_MEM_FENCE); diff --git a/gpu-opencl-diffusion/kernel_convolution.cl b/gpu-opencl-diffusion/kernel_convolution.cl index 291cd98..a8da030 100644 --- a/gpu-opencl-diffusion/kernel_convolution.cl +++ b/gpu-opencl-diffusion/kernel_convolution.cl @@ -22,7 +22,7 @@ */ #pragma OPENCL 
EXTENSION cl_khr_fp64: enable -#include "opencl_kernels.h" +#include "numerics.h" /** \brief Tiled convolution algorithm for execution on the GPU @@ -44,57 +44,56 @@ - The \a __local specifier allocates the small \a d_conc_tile array in cache - The \a __constant specifier allocates the small \a d_mask array in cache */ -__kernel void convolution_kernel(__global fp_t* d_conc_old, - __global fp_t* d_conc_lap, +__kernel void convolution_kernel(__global fp_t* d_conc_old, + __global fp_t* d_conc_lap, __constant fp_t* d_mask, + __local fp_t* d_conc_tile, int nx, int ny, int nm) { int i, j; - int dst_col, dst_cols, dst_row, dst_rows; - int src_col, src_cols, src_row, src_rows; - int til_col, til_row; + int dst_nx, dst_ny, dst_x, dst_y; + int src_nx, src_ny, src_x, src_y; + int til_nx, til_x, til_y; fp_t value = 0.; /* source tile includes the halo cells, destination tile does not */ - src_cols = get_local_size(0); - src_rows = get_local_size(1); + src_ny = get_local_size(0); + src_nx = get_local_size(1); + til_nx = src_nx; - dst_cols = src_cols - nm + 1; - dst_rows = src_rows - nm + 1; + dst_ny = src_ny - nm + 1; + dst_nx = src_nx - nm + 1; /* determine indices on which to operate */ - til_col = get_local_id(0); - til_row = get_local_id(1); + til_x = get_local_id(0); + til_y = get_local_id(1); - dst_col = get_group_id(0) * dst_cols + til_col; - dst_row = get_group_id(1) * dst_rows + til_row; + dst_x = get_group_id(0) * dst_ny + til_x; + dst_y = get_group_id(1) * dst_nx + til_y; - src_col = dst_col - nm/2; - src_row = dst_row - nm/2; + src_x = dst_x - nm/2; + src_y = dst_y - nm/2; - /* shared memory tile: __local gives access to all threads in the group */ - __local fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1]; - - if (src_row >= 0 && src_row < ny && - src_col >= 0 && src_col < nx) { - d_conc_tile[til_row][til_col] = d_conc_old[src_row * nx + src_col]; + if (src_x >= 0 && src_x < nx && + src_y >= 0 && src_y < ny) { + d_conc_tile[til_nx * til_y + til_x] = d_conc_old[nx * src_y + src_x]; } /* tile data is shared: wait for all threads to finish copying */ barrier(CLK_LOCAL_MEM_FENCE); /* compute the convolution */ - if (til_col < dst_cols && til_row < dst_rows) { + if (til_x < dst_ny && til_y < dst_nx) { for (j = 0; j < nm; j++) { for (i = 0; i < nm; i++) { - value += d_mask[j * nm + i] * d_conc_tile[j+til_row][i+til_col]; + value += d_mask[nm * j + i] * d_conc_tile[til_nx * (til_y+j) + til_x+i]; } } /* record value */ - if (dst_row < ny && dst_col < nx) { - d_conc_lap[dst_row * nx + dst_col] = value; + if (dst_y < ny && dst_x < nx) { + d_conc_lap[nx * dst_y + dst_x] = value; } } diff --git a/gpu-opencl-diffusion/kernel_diffusion.cl b/gpu-opencl-diffusion/kernel_diffusion.cl index 136b143..91e3f64 100644 --- a/gpu-opencl-diffusion/kernel_diffusion.cl +++ b/gpu-opencl-diffusion/kernel_diffusion.cl @@ -22,7 +22,7 @@ */ #pragma OPENCL EXTENSION cl_khr_fp64: enable -#include "opencl_kernels.h" +#include "numerics.h" /** \brief Diffusion equation kernel for execution on the GPU diff --git a/gpu-opencl-diffusion/opencl_boundaries.c b/gpu-opencl-diffusion/opencl_boundaries.c index b90fd84..060883b 100644 --- a/gpu-opencl-diffusion/opencl_boundaries.c +++ b/gpu-opencl-diffusion/opencl_boundaries.c @@ -25,10 +25,8 @@ #include #include #include - #include "boundaries.h" - -#include "opencl_kernels.h" +#include "numerics.h" void set_boundaries(fp_t bc[2][2]) { diff --git a/gpu-opencl-diffusion/opencl_data.c b/gpu-opencl-diffusion/opencl_data.c index 70cbbee..5d8f8be 100644 --- 
a/gpu-opencl-diffusion/opencl_data.c +++ b/gpu-opencl-diffusion/opencl_data.c @@ -26,9 +26,8 @@ #include #include #include - +#include "numerics.h" #include "opencl_data.h" -#include "opencl_kernels.h" void report_error(cl_int status, const char* message) { diff --git a/gpu-opencl-diffusion/opencl_data.h b/gpu-opencl-diffusion/opencl_data.h index eab8dd6..723041d 100644 --- a/gpu-opencl-diffusion/opencl_data.h +++ b/gpu-opencl-diffusion/opencl_data.h @@ -112,7 +112,7 @@ void init_opencl(fp_t** conc_old, fp_t** mask_lap, fp_t bc[2][2], \brief Specialization of solve_diffusion_equation() using OpenCL */ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, - int nx, int ny, int nm, + int bx, int by, int nx, int ny, int nm, fp_t D, fp_t dt, int checks, fp_t *elapsed, struct Stopwatch* sw); diff --git a/gpu-opencl-diffusion/opencl_discretization.c b/gpu-opencl-diffusion/opencl_discretization.c index d9cdf65..9093c0d 100644 --- a/gpu-opencl-diffusion/opencl_discretization.c +++ b/gpu-opencl-diffusion/opencl_discretization.c @@ -28,11 +28,12 @@ #include "boundaries.h" #include "discretization.h" #include "mesh.h" +#include "numerics.h" #include "timer.h" #include "opencl_data.h" -#include "opencl_kernels.h" void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, + int bx, int by, int nx, int ny, int nm, fp_t D, fp_t dt, int checks, fp_t *elapsed, struct Stopwatch* sw) @@ -46,12 +47,12 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, power of two: 4, 8, 16, 32, etc. OpenCL will make a best-guess optimal block size if you set size_t* tile_dim = NULL. */ - size_t tile_dim[2] = {TILE_W, - TILE_H}; + size_t tile_dim[2] = {bx, by}; size_t bloc_dim[2] = {ceil((float)(nx) / (tile_dim[0] - nm + 1)), ceil((float)(ny) / (tile_dim[1] - nm + 1))}; size_t grid_dim[2] = {bloc_dim[0] * tile_dim[0], bloc_dim[1] * tile_dim[1]}; + size_t buf_size = (tile_dim[0] + nm) * (tile_dim[1] + nm) * sizeof(fp_t); cl_mem d_conc_old = dev->conc_old; cl_mem d_conc_new = dev->conc_new; @@ -67,9 +68,10 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, status |= clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap)); status |= clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask)); - status |= clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx); - status |= clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny); - status |= clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm); + status |= clSetKernelArg(dev->convolution_kernel, 3, buf_size, NULL); + status |= clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&nx); + status |= clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&ny); + status |= clSetKernelArg(dev->convolution_kernel, 6, sizeof(int), (void *)&nm); report_error(status, "constant convolution kernel args"); status |= clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap)); @@ -94,25 +96,33 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, } /* set time-dependent kernel arguments */ - status = clSetKernelArg(dev->boundary_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); + status = clSetKernelArg(dev->boundary_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); report_error(status, "mutable boundary kernel args"); status = clSetKernelArg(dev->convolution_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); report_error(status, "mutable convolution kernel args"); - status |= 
clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); - status |= clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new); + status |= clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); + status |= clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new); report_error(status, "mutable diffusion kernel args"); - /* enqueue kernels */ + /* apply boundary conditions */ status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); + + /* compute Laplacian */ + start_time = GetTimer(); status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); + sw->conv += GetTimer() - start_time; + + /* compute result */ + start_time = GetTimer(); status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); - report_error(status, "enqueue kernels"); + sw->step += GetTimer() - start_time; - *elapsed += dt; + report_error(status, "enqueue kernels"); } + *elapsed += dt * checks; /* transfer from device out to host */ start_time = GetTimer(); diff --git a/gpu-opencl-diffusion/opencl_kernels.h b/gpu-opencl-diffusion/opencl_kernels.h deleted file mode 100644 index 2afc6ca..0000000 --- a/gpu-opencl-diffusion/opencl_kernels.h +++ /dev/null @@ -1,44 +0,0 @@ -/********************************************************************************** - HiPerC: High Performance Computing Strategies for Boundary Value Problems - written by Trevor Keller and available from https://github.com/usnistgov/hiperc - - This software was developed at the National Institute of Standards and Technology - by employees of the Federal Government in the course of their official duties. - Pursuant to title 17 section 105 of the United States Code this software is not - subject to copyright protection and is in the public domain. NIST assumes no - responsibility whatsoever for the use of this software by other parties, and makes - no guarantees, expressed or implied, about its quality, reliability, or any other - characteristic. We would appreciate acknowledgement if the software is used. - - This software can be redistributed and/or modified freely provided that any - derivative works bear some notice that they are derived from it, and any modified - versions bear some notice that they have been modified. 
- - Questions/comments to Trevor Keller (trevor.keller@nist.gov) - **********************************************************************************/ - -/** - \file opencl_kernels.h - \brief Declaration of functions to execute on the GPU (OpenCL kernels) -*/ - -/** \cond SuppressGuard */ -#ifndef _OPENCL_KERNELS_H_ -#define _OPENCL_KERNELS_H_ -/** \endcond */ - -#include "numerics.h" - -/** - \brief Width of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_W 32 - -/** - \brief Height of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_H 32 - -/** \cond SuppressGuard */ -#endif /* _OPENCL_KERNELS_H_ */ -/** \endcond */ diff --git a/gpu-opencl-diffusion/opencl_main.c b/gpu-opencl-diffusion/opencl_main.c index 245fa8f..7a95d7d 100644 --- a/gpu-opencl-diffusion/opencl_main.c +++ b/gpu-opencl-diffusion/opencl_main.c @@ -109,7 +109,7 @@ int main(int argc, char* argv[]) assert(step + checks <= steps); - opencl_diffusion_solver(&dev, conc_new, nx, ny, nm, D, dt, checks, &elapsed, &sw); + opencl_diffusion_solver(&dev, conc_new, bx, by, nx, ny, nm, D, dt, checks, &elapsed, &sw); for (i = 0; i < checks; i++) { step++; From 4c78f9f23e3f359851b3cd7b53d03bce7f5bf1aa Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Tue, 17 Oct 2017 18:15:42 -0400 Subject: [PATCH 4/4] Fix usage of OpenAcc data flow controls; 4x slower than tiled GPU algorithms (CUDA, OpenCL) Note: apply `#pragma acc declare present` in routines. Closes #49. --- gpu-openacc-diffusion/Makefile | 2 +- gpu-openacc-diffusion/openacc_boundaries.c | 59 ++++++++++--------- .../openacc_discretization.c | 24 +++----- 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/gpu-openacc-diffusion/Makefile b/gpu-openacc-diffusion/Makefile index 452a19e..02f3da8 100644 --- a/gpu-openacc-diffusion/Makefile +++ b/gpu-openacc-diffusion/Makefile @@ -2,7 +2,7 @@ # OpenACC implementation CXX = pgcc -CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -Minfo=accel -mp +CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -ta=tesla:cc30 -ta=tesla:cc50 -ta=tesla:cc60 -Minfo=accel -mp LINKS = -lm -lpng OBJS = boundaries.o discretization.o mesh.o numerics.o output.o timer.o diff --git a/gpu-openacc-diffusion/openacc_boundaries.c b/gpu-openacc-diffusion/openacc_boundaries.c index 6627782..d36a309 100644 --- a/gpu-openacc-diffusion/openacc_boundaries.c +++ b/gpu-openacc-diffusion/openacc_boundaries.c @@ -59,47 +59,48 @@ void apply_initial_conditions(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2] } } -void boundary_kernel(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2]) +void boundary_kernel(fp_t** __restrict__ conc, int nx, int ny, int nm, fp_t bc[2][2]) { /* apply fixed boundary values: sequence does not matter */ - #pragma acc parallel + #pragma acc declare present(conc[0:ny][0:nx], bc[0:2][0:2]) { - #pragma acc loop - for (int j = 0; j < ny/2; j++) { - #pragma acc loop - for (int i = 0; i < 1+nm/2; i++) { - conc[j][i] = bc[1][0]; /* left value */ + #pragma acc parallel + { + #pragma acc loop independent collapse(2) + for (int j = 0; j < ny/2; j++) { + for (int i = 0; i < 1+nm/2; i++) { + conc[j][i] = bc[1][0]; /* left value */ + } } - } - #pragma acc loop - for (int j = ny/2; j < ny; j++) { - #pragma acc loop - for (int i = nx-1-nm/2; i < nx; i++) { - conc[j][i] = bc[1][1]; /* right value */ + #pragma acc loop independent collapse(2) + for (int j = ny/2; j < ny; j++) { + for (int i = nx-1-nm/2; i < nx; i++) { + conc[j][i] = bc[1][1]; /* right value */ + } } } /* apply no-flux 
boundary conditions: inside to out, sequence matters */ for (int offset = 0; offset < nm/2; offset++) { - int ilo = nm/2 - offset; - int ihi = nx - 1 - nm/2 + offset; - #pragma acc loop - for (int j = 0; j < ny; j++) { - conc[j][ilo-1] = conc[j][ilo]; /* left condition */ - conc[j][ihi+1] = conc[j][ihi]; /* right condition */ - } - } - - for (int offset = 0; offset < nm/2; offset++) { - int jlo = nm/2 - offset; - int jhi = ny - 1 - nm/2 + offset; - #pragma acc loop - for (int i = 0; i < nx; i++) { - conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */ - conc[jhi+1][i] = conc[jhi][i]; /* top condition */ + #pragma acc parallel + { + int ilo = nm/2 - offset; + int ihi = nx - 1 - nm/2 + offset; + int jlo = nm/2 - offset; + int jhi = ny - 1 - nm/2 + offset; + #pragma acc loop independent + for (int j = 0; j < ny; j++) { + conc[j][ilo-1] = conc[j][ilo]; /* left condition */ + conc[j][ihi+1] = conc[j][ihi]; /* right condition */ + } + #pragma acc loop independent + for (int i = 0; i < nx; i++) { + conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */ + conc[jhi+1][i] = conc[jhi][i]; /* top condition */ + } } } } diff --git a/gpu-openacc-diffusion/openacc_discretization.c b/gpu-openacc-diffusion/openacc_discretization.c index 677a20b..c984de0 100644 --- a/gpu-openacc-diffusion/openacc_discretization.c +++ b/gpu-openacc-diffusion/openacc_discretization.c @@ -31,13 +31,14 @@ void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int nx, int ny, int nm) { + #pragma acc declare present(conc_old[0:ny][0:nx], conc_lap[0:ny][0:nx], mask_lap[0:nm][0:nm]) #pragma acc parallel { - #pragma acc loop + #pragma acc loop collapse(2) for (int j = nm/2; j < ny-nm/2; j++) { - #pragma acc loop for (int i = nm/2; i < nx-nm/2; i++) { fp_t value = 0.; + #pragma acc loop seq collapse(2) for (int mj = -nm/2; mj < 1+nm/2; mj++) { for (int mi = -nm/2; mi < 1+nm/2; mi++) { value += mask_lap[mj+nm/2][mi+nm/2] * conc_old[j+mj][i+mi]; @@ -52,11 +53,11 @@ void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int n void diffusion_kernel(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, int nx, int ny, int nm, fp_t D, fp_t dt) { + #pragma acc declare present(conc_old[0:ny][0:nx], conc_new[0:ny][0:nx], conc_lap[0:ny][0:nx]) #pragma acc parallel { - #pragma acc loop + #pragma acc loop collapse(2) for (int j = nm/2; j < ny-nm/2; j++) { - #pragma acc loop for (int i = nm/2; i < nx-nm/2; i++) { conc_new[j][i] = conc_old[j][i] + dt * D * conc_lap[j][i]; } @@ -64,23 +65,14 @@ void diffusion_kernel(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, } } - -void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, - int nx, int ny, int nm) -{ - /* If you must compute the convolution separately, do so here. */ - #pragma acc data copyin(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm]) create(conc_lap[0:ny][0:nx]) copyout(conc_lap[0:ny][0:nx]) - { - convolution_kernel(conc_old, conc_lap, mask_lap, nx, ny, nm); - } -} - void solve_diffusion_equation(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, fp_t** mask_lap, int nx, int ny, int nm, fp_t bc[2][2], fp_t D, fp_t dt, int checks, fp_t* elapsed, struct Stopwatch* sw) { - #pragma acc data copy(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm], bc[0:2][0:2]) create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx]) + #pragma acc data present_or_copy(conc_old[0:ny][0:nx]) \ + present_or_copyin(mask_lap[0:nm][0:nm], bc[0:2][0:2]) \ + present_or_create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx]) { double start_time=0.; int check=0;