From 45075008307ce14e6b381ac2196c53a2db574966 Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Thu, 21 Sep 2017 09:45:31 -0400 Subject: [PATCH] streamlining --- common-diffusion/params.txt | 6 +- gpu-cuda-diffusion/cuda_boundaries.cu | 28 ++++---- gpu-opencl-diffusion/Makefile | 5 +- gpu-opencl-diffusion/kernel_boundary.cl | 35 +++++---- gpu-opencl-diffusion/kernel_convolution.cl | 35 +++++---- gpu-opencl-diffusion/opencl_data.c | 6 ++ gpu-opencl-diffusion/opencl_discretization.c | 76 ++++++++------------ 7 files changed, 89 insertions(+), 102 deletions(-) diff --git a/common-diffusion/params.txt b/common-diffusion/params.txt index 5bed383..119c5b8 100644 --- a/common-diffusion/params.txt +++ b/common-diffusion/params.txt @@ -1,7 +1,9 @@ -nx 512 # mesh points along x-axis -ny 512 # mesh points along y-axis +nx 512 # total mesh points along x-axis +ny 512 # total mesh points along y-axis dx 0.5 # mesh resolution along x-axis dy 0.5 # mesh resolution along y-axis +bx 32 # convolution block size along x-axis +by 32 # convolution block size along y-axis ns 100000 # number of timesteps to march nc 10000 # number of timesteps between checkpoint outputs dc 0.00625 # diffusion coefficient diff --git a/gpu-cuda-diffusion/cuda_boundaries.cu b/gpu-cuda-diffusion/cuda_boundaries.cu index 9430d86..e29d913 100644 --- a/gpu-cuda-diffusion/cuda_boundaries.cu +++ b/gpu-cuda-diffusion/cuda_boundaries.cu @@ -83,7 +83,7 @@ __global__ void boundary_kernel(fp_t* d_conc, /* apply fixed boundary values: sequence does not matter */ - if (row >= 0 && row < ny/2 && col >= 0 && col < 1+nm/2) { + if (row < ny/2 && col < 1+nm/2) { d_conc[row * nx + col] = d_bc[1][0]; /* left value */ } @@ -99,22 +99,22 @@ __global__ void boundary_kernel(fp_t* d_conc, for (offset = 0; offset < nm/2; offset++) { ilo = nm/2 - offset; ihi = nx - 1 - nm/2 + offset; - if (col == ilo-1 && row >= 0 && row < ny) { - d_conc[row * nx + col] = d_conc[row * nx + ilo]; /* left condition */ - } else if (col == ihi+1 && row >= 0 && row < ny) { - d_conc[row * nx + col] = d_conc[row * nx + ihi]; /* right condition */ - } - __syncthreads(); - } - - for (offset = 0; offset < nm/2; offset++) { jlo = nm/2 - offset; jhi = ny - 1 - nm/2 + offset; - if (row == jlo-1 && col >= 0 && col < nx) { - d_conc[row * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */ - } else if (row == jhi+1 && col >= 0 && col < nx) { - d_conc[row * nx + col] = d_conc[jhi * nx + col]; /* top condition */ + + if (ilo-1 == col && row < ny) { + d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */ + } + if (ihi+1 == col && row < ny) { + d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */ + } + if (jlo-1 == row && col < nx) { + d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */ } + if (jhi+1 == row && col < nx) { + d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */ + } + __syncthreads(); } } diff --git a/gpu-opencl-diffusion/Makefile b/gpu-opencl-diffusion/Makefile index 564dc37..c922473 100644 --- a/gpu-opencl-diffusion/Makefile +++ b/gpu-opencl-diffusion/Makefile @@ -2,16 +2,17 @@ # OpenCL implementation CC = gcc -CFLAGS = -g -Wall -pedantic -std=c11 -I../common-diffusion -fopenmp +CFLAGS = -O3 -Wall -pedantic -std=c11 -I../common-diffusion -fopenmp LINKS = -lm -lpng -lOpenCL LLVM = clang LFLAGS = -S -emit-llvm -x cl -I../common-diffusion +KERNELS = kernel_boundary.cl kernel_convolution.cl kernel_diffusion.cl OBJS = boundaries.o data.o discretization.o mesh.o numerics.o output.o timer.o # Executable -diffusion: opencl_main.c $(OBJS) +diffusion: opencl_main.c $(KERNELS) $(OBJS) $(CC) $(CFLAGS) $(OBJS) $< -o $@ $(LINKS) # OpenCL objects diff --git a/gpu-opencl-diffusion/kernel_boundary.cl b/gpu-opencl-diffusion/kernel_boundary.cl index 466206e..e584623 100644 --- a/gpu-opencl-diffusion/kernel_boundary.cl +++ b/gpu-opencl-diffusion/kernel_boundary.cl @@ -37,23 +37,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc, int ny, int nm) { - int idx, col, row; + int col, row; int ihi, ilo, jhi, jlo, offset; /* determine indices on which to operate */ col = get_global_id(0); row = get_global_id(1); - idx = row * nx + col; /* apply fixed boundary values: sequence does not matter */ - if (row >= 0 && row < ny/2 && col >= 0 && col < 1+nm/2) { - d_conc[idx] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */ + if (row < ny/2 && col < 1+nm/2) { + d_conc[row * nx + col] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */ } if (row >= ny/2 && row < ny && col >= nx-1-nm/2 && col < nx) { - d_conc[idx] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */ + d_conc[row * nx + col] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */ } /* wait for all threads to finish writing */ @@ -64,22 +63,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc, for (offset = 0; offset < nm/2; offset++) { ilo = nm/2 - offset; ihi = nx - 1 - nm/2 + offset; - if (col == ilo-1 && row >= 0 && row < ny) { - d_conc[idx] = d_conc[row * nx + ilo]; /* left condition */ - } else if (col == ihi+1 && row >= 0 && row < ny) { - d_conc[idx] = d_conc[row * nx + ihi]; /* right condition */ - } - barrier(CLK_GLOBAL_MEM_FENCE); - } - - for (offset = 0; offset < nm/2; offset++) { jlo = nm/2 - offset; jhi = ny - 1 - nm/2 + offset; - if (row == jlo-1 && col >= 0 && col < nx) { - d_conc[idx] = d_conc[jlo * nx + col]; /* bottom condition */ - } else if (row == jhi+1 && col >= 0 && col < nx) { - d_conc[idx] = d_conc[jhi * nx + col]; /* top condition */ + + if (ilo-1 == col && row < ny) { + d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */ + } + if (ihi+1 == col && row < ny) { + d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */ + } + if (jlo-1 == row && col < nx) { + d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */ } + if (jhi+1 == row && col < nx) { + d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */ + } + barrier(CLK_GLOBAL_MEM_FENCE); } } diff --git a/gpu-opencl-diffusion/kernel_convolution.cl b/gpu-opencl-diffusion/kernel_convolution.cl index 3e5ca29..1cafd10 100644 --- a/gpu-opencl-diffusion/kernel_convolution.cl +++ b/gpu-opencl-diffusion/kernel_convolution.cl @@ -51,38 +51,35 @@ __kernel void convolution_kernel(__global fp_t* d_conc_old, int ny, int nm) { - int i, j, til_col, til_row; - int dst_col, dst_idx, dst_row, dst_cols, dst_rows; - int src_col, src_idx, src_row; - fp_t value=0.; + int i, j; + int dst_col, dst_cols, dst_row, dst_rows; + int src_col, src_cols, src_row, src_rows; + int til_col, til_row; + fp_t value = 0.; /* source tile includes the halo cells, destination tile does not */ - dst_cols = get_local_size(0) - nm + 1; - dst_rows = get_local_size(1) - nm + 1; + src_cols = get_local_size(0); + src_rows = get_local_size(1); + + dst_cols = src_cols - nm + 1; + dst_rows = src_rows - nm + 1; /* determine indices on which to operate */ til_col = get_local_id(0); til_row = get_local_id(1); - dst_col = get_group_id(0) * dst_cols + til_col; /* not get_global_id(0); */ - dst_row = get_group_id(1) * dst_rows + til_row; /* not get_global_id(1); */ + dst_col = get_group_id(0) * dst_cols + til_col; + dst_row = get_group_id(1) * dst_rows + til_row; src_col = dst_col - nm/2; src_row = dst_row - nm/2; - dst_idx = dst_row * nx + dst_col; - src_idx = src_row * nx + src_col; - /* shared memory tile: __local gives access to all threads in the group */ __local fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1]; - if ((src_row >= 0) && (src_row < ny) && - (src_col >= 0) && (src_col < nx)) { - /* if src_row==0, then dst_row==nm/2: this is a halo row */ - d_conc_tile[til_row][til_col] = d_conc_old[src_idx]; - } else { - /* points outside the halo should be switched off */ - d_conc_tile[til_row][til_col] = 0.; + if (src_row >= 0 && src_row < ny && + src_col >= 0 && src_col < nx) { + d_conc_tile[til_row][til_col] = d_conc_old[src_row * nx + src_col]; } /* tile data is shared: wait for all threads to finish copying */ @@ -97,7 +94,7 @@ __kernel void convolution_kernel(__global fp_t* d_conc_old, } /* record value */ if (dst_row < ny && dst_col < nx) { - d_conc_lap[dst_idx] = value; + d_conc_lap[dst_row * nx + dst_col] = value; } } diff --git a/gpu-opencl-diffusion/opencl_data.c b/gpu-opencl-diffusion/opencl_data.c index 860aa7f..2b71153 100644 --- a/gpu-opencl-diffusion/opencl_data.c +++ b/gpu-opencl-diffusion/opencl_data.c @@ -228,6 +228,12 @@ void build_program(const char* filename, void free_opencl(struct OpenCLData* dev) { /* clean up */ + free(dev->conc_old); + free(dev->conc_new); + free(dev->conc_lap); + free(dev->bc); + free(dev->mask); + clReleaseContext(dev->context); clReleaseKernel(dev->boundary_kernel); diff --git a/gpu-opencl-diffusion/opencl_discretization.c b/gpu-opencl-diffusion/opencl_discretization.c index 264a080..8c0a728 100644 --- a/gpu-opencl-diffusion/opencl_discretization.c +++ b/gpu-opencl-diffusion/opencl_discretization.c @@ -55,38 +55,26 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, cl_int status = CL_SUCCESS; /* set immutable kernel arguments */ - status = clSetKernelArg(dev->boundary_kernel, 1, sizeof(cl_mem), (void *)&(dev->bc)); - report_error(status, "const boundary args[1]"); - status = clSetKernelArg(dev->boundary_kernel, 2, sizeof(int), (void *)&nx); - report_error(status, "const boundary args[2]"); - status = clSetKernelArg(dev->boundary_kernel, 3, sizeof(int), (void *)&ny); - report_error(status, "const boundary args[3]"); - status = clSetKernelArg(dev->boundary_kernel, 4, sizeof(int), (void *)&nm); - report_error(status, "const boundary args[4]"); - - status = clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap)); - report_error(status, "const convolution args[1]"); - status = clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask)); - report_error(status, "const convolution args[2]"); - status = clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx); - report_error(status, "const convolution args[3]"); - status = clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny); - report_error(status, "const convolution args[4]"); - status = clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm); - report_error(status, "const convolution args[5]"); - - status = clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap)); - report_error(status, "const diffusion args[2]"); - status = clSetKernelArg(dev->diffusion_kernel, 3, sizeof(int), (void *)&nx); - report_error(status, "const diffusion args[3]"); - status = clSetKernelArg(dev->diffusion_kernel, 4, sizeof(int), (void *)&ny); - report_error(status, "const diffusion args[4]"); - status = clSetKernelArg(dev->diffusion_kernel, 5, sizeof(int), (void *)&nm); - report_error(status, "const diffusion args[5]"); - status = clSetKernelArg(dev->diffusion_kernel, 6, sizeof(fp_t), (void *)&D); - report_error(status, "const diffusion args[6]"); - status = clSetKernelArg(dev->diffusion_kernel, 7, sizeof(fp_t), (void *)&dt); - report_error(status, "const diffusion args[7]"); + status |= clSetKernelArg(dev->boundary_kernel, 1, sizeof(cl_mem), (void *)&(dev->bc)); + status |= clSetKernelArg(dev->boundary_kernel, 2, sizeof(int), (void *)&nx); + status |= clSetKernelArg(dev->boundary_kernel, 3, sizeof(int), (void *)&ny); + status |= clSetKernelArg(dev->boundary_kernel, 4, sizeof(int), (void *)&nm); + report_error(status, "constant boundary kernal args"); + + status |= clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap)); + status |= clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask)); + status |= clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx); + status |= clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny); + status |= clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm); + report_error(status, "constant convolution kernel args"); + + status |= clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap)); + status |= clSetKernelArg(dev->diffusion_kernel, 3, sizeof(int), (void *)&nx); + status |= clSetKernelArg(dev->diffusion_kernel, 4, sizeof(int), (void *)&ny); + status |= clSetKernelArg(dev->diffusion_kernel, 5, sizeof(int), (void *)&nm); + status |= clSetKernelArg(dev->diffusion_kernel, 6, sizeof(fp_t), (void *)&D); + status |= clSetKernelArg(dev->diffusion_kernel, 7, sizeof(fp_t), (void *)&dt); + report_error(status, "constant diffusion kernel args"); /* OpenCL uses cl_mem, not fp_t*, so swap_pointers won't work. * We leave the pointers alone but call the kernel on the appropriate data location. @@ -103,26 +91,20 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, /* set time-dependent kernel arguments */ status = clSetKernelArg(dev->boundary_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); - report_error(status, "mutable boundary args[0]"); + report_error(status, "mutable boundary kernel args"); status = clSetKernelArg(dev->convolution_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); - report_error(status, "mutable convolution args[0]"); + report_error(status, "mutable convolution kernel args"); - status = clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); - report_error(status, "mutable diffusion args[0]"); - - status = clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new); - report_error(status, "mutable diffusion args[1]"); + status |= clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); + status |= clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new); + report_error(status, "mutable diffusion kernel args"); /* enqueue kernels */ - status = clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL); - report_error(status, "enqueue boundary kernel"); - - status = clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL); - report_error(status, "enqueue convolution kernel"); - - status = clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL); - report_error(status, "enqueue diffusion kernel"); + status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL); + status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL); + status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL); + report_error(status, "enqueue kernels"); } *elapsed += dt * checks;