From 45075008307ce14e6b381ac2196c53a2db574966 Mon Sep 17 00:00:00 2001
From: Trevor Keller <trevor.keller@nist.gov>
Date: Thu, 21 Sep 2017 09:45:31 -0400
Subject: [PATCH] streamlining

---
 common-diffusion/params.txt                  |  6 +-
 gpu-cuda-diffusion/cuda_boundaries.cu        | 28 ++++----
 gpu-opencl-diffusion/Makefile                |  5 +-
 gpu-opencl-diffusion/kernel_boundary.cl      | 35 +++++----
 gpu-opencl-diffusion/kernel_convolution.cl   | 35 +++++----
 gpu-opencl-diffusion/opencl_data.c           |  6 ++
 gpu-opencl-diffusion/opencl_discretization.c | 76 ++++++++------------
 7 files changed, 89 insertions(+), 102 deletions(-)

diff --git a/common-diffusion/params.txt b/common-diffusion/params.txt
index 5bed383..119c5b8 100644
--- a/common-diffusion/params.txt
+++ b/common-diffusion/params.txt
@@ -1,7 +1,9 @@
-nx 512     # mesh points along x-axis
-ny 512     # mesh points along y-axis
+nx 512     # total mesh points along x-axis
+ny 512     # total mesh points along y-axis
 dx 0.5     # mesh resolution along x-axis
 dy 0.5     # mesh resolution along y-axis
+bx 32      # convolution block size along x-axis
+by 32      # convolution block size along y-axis
 ns 100000  # number of timesteps to march
 nc 10000   # number of timesteps between checkpoint outputs
 dc 0.00625 # diffusion coefficient
diff --git a/gpu-cuda-diffusion/cuda_boundaries.cu b/gpu-cuda-diffusion/cuda_boundaries.cu
index 9430d86..e29d913 100644
--- a/gpu-cuda-diffusion/cuda_boundaries.cu
+++ b/gpu-cuda-diffusion/cuda_boundaries.cu
@@ -83,7 +83,7 @@ __global__ void boundary_kernel(fp_t* d_conc,
 
 	/* apply fixed boundary values: sequence does not matter */
 
-	if (row >= 0 && row < ny/2 && col >= 0 && col < 1+nm/2) {
+	if (row < ny/2 && col < 1+nm/2) {
 		d_conc[row * nx + col] = d_bc[1][0]; /* left value */
 	}
 
@@ -99,22 +99,22 @@ __global__ void boundary_kernel(fp_t* d_conc,
 	for (offset = 0; offset < nm/2; offset++) {
 		ilo = nm/2 - offset;
 		ihi = nx - 1 - nm/2 + offset;
-		if (col == ilo-1 && row >= 0 && row < ny) {
-			d_conc[row * nx + col] = d_conc[row * nx + ilo]; /* left condition */
-		} else if (col == ihi+1 && row >= 0 && row < ny) {
-			d_conc[row * nx + col] = d_conc[row * nx + ihi]; /* right condition */
-		}
-		__syncthreads();
-	}
-
-	for (offset = 0; offset < nm/2; offset++) {
 		jlo = nm/2 - offset;
 		jhi = ny - 1 - nm/2 + offset;
-		if (row == jlo-1 && col >= 0 && col < nx) {
-			d_conc[row * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */
-		} else if (row == jhi+1 && col >= 0 && col < nx) {
-			d_conc[row * nx + col] = d_conc[jhi * nx + col]; /* top condition */
+
+		if (ilo-1 == col && row < ny) {
+			d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */
+		}
+		if (ihi+1 == col && row < ny) {
+			d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */
+		}
+		if (jlo-1 == row && col < nx) {
+			d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */
 		}
+		if (jhi+1 == row && col < nx) {
+			d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */
+		}
+
 		__syncthreads();
 	}
 }
diff --git a/gpu-opencl-diffusion/Makefile b/gpu-opencl-diffusion/Makefile
index 564dc37..c922473 100644
--- a/gpu-opencl-diffusion/Makefile
+++ b/gpu-opencl-diffusion/Makefile
@@ -2,16 +2,17 @@
 # OpenCL implementation
 
 CC = gcc
-CFLAGS = -g -Wall -pedantic -std=c11 -I../common-diffusion -fopenmp
+CFLAGS = -O3 -Wall -pedantic -std=c11 -I../common-diffusion -fopenmp
 LINKS = -lm -lpng -lOpenCL
 
 LLVM = clang
 LFLAGS = -S -emit-llvm -x cl -I../common-diffusion
 
+KERNELS = kernel_boundary.cl kernel_convolution.cl kernel_diffusion.cl
 OBJS = boundaries.o data.o discretization.o mesh.o numerics.o output.o timer.o
 
 # Executable
-diffusion: opencl_main.c $(OBJS)
+diffusion: opencl_main.c $(KERNELS) $(OBJS)
 	$(CC) $(CFLAGS) $(OBJS) $< -o $@ $(LINKS)
 
 # OpenCL objects
diff --git a/gpu-opencl-diffusion/kernel_boundary.cl b/gpu-opencl-diffusion/kernel_boundary.cl
index 466206e..e584623 100644
--- a/gpu-opencl-diffusion/kernel_boundary.cl
+++ b/gpu-opencl-diffusion/kernel_boundary.cl
@@ -37,23 +37,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc,
                               int ny,
                               int nm)
 {
-	int idx, col, row;
+	int col, row;
 	int ihi, ilo, jhi, jlo, offset;
 
 	/* determine indices on which to operate */
 
 	col = get_global_id(0);
 	row = get_global_id(1);
-	idx = row * nx + col;
 
 	/* apply fixed boundary values: sequence does not matter */
 
-	if (row >= 0 && row < ny/2 && col >= 0 && col < 1+nm/2) {
-		d_conc[idx] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */
+	if (row < ny/2 && col < 1+nm/2) {
+		d_conc[row * nx + col] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */
 	}
 
 	if (row >= ny/2 && row < ny && col >= nx-1-nm/2 && col < nx) {
-		d_conc[idx] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */
+		d_conc[row * nx + col] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */
 	}
 
 	/* wait for all threads to finish writing */
@@ -64,22 +63,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc,
 	for (offset = 0; offset < nm/2; offset++) {
 		ilo = nm/2 - offset;
 		ihi = nx - 1 - nm/2 + offset;
-		if (col == ilo-1 && row >= 0 && row < ny) {
-			d_conc[idx] = d_conc[row * nx + ilo]; /* left condition */
-		} else if (col == ihi+1 && row >= 0 && row < ny) {
-			d_conc[idx] = d_conc[row * nx + ihi]; /* right condition */
-		}
-		barrier(CLK_GLOBAL_MEM_FENCE);
-	}
-
-	for (offset = 0; offset < nm/2; offset++) {
 		jlo = nm/2 - offset;
 		jhi = ny - 1 - nm/2 + offset;
-		if (row == jlo-1 && col >= 0 && col < nx) {
-			d_conc[idx] = d_conc[jlo * nx + col]; /* bottom condition */
-		} else if (row == jhi+1 && col >= 0 && col < nx) {
-			d_conc[idx] = d_conc[jhi * nx + col]; /* top condition */
+
+		if (ilo-1 == col && row < ny) {
+			d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */
+		}
+		if (ihi+1 == col && row < ny) {
+			d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */
+		}
+		if (jlo-1 == row && col < nx) {
+			d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */
 		}
+		if (jhi+1 == row && col < nx) {
+			d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */
+		}
+
 		barrier(CLK_GLOBAL_MEM_FENCE);
 	}
 }
diff --git a/gpu-opencl-diffusion/kernel_convolution.cl b/gpu-opencl-diffusion/kernel_convolution.cl
index 3e5ca29..1cafd10 100644
--- a/gpu-opencl-diffusion/kernel_convolution.cl
+++ b/gpu-opencl-diffusion/kernel_convolution.cl
@@ -51,38 +51,35 @@ __kernel void convolution_kernel(__global fp_t* d_conc_old,
                                  int ny,
                                  int nm)
 {
-	int i, j, til_col, til_row;
-	int dst_col, dst_idx, dst_row, dst_cols, dst_rows;
-	int src_col, src_idx, src_row;
-	fp_t value=0.;
+	int i, j;
+	int dst_col, dst_cols, dst_row, dst_rows;
+	int src_col, src_cols, src_row, src_rows;
+	int til_col, til_row;
+	fp_t value = 0.;
 
 	/* source tile includes the halo cells, destination tile does not */
-	dst_cols = get_local_size(0) - nm + 1;
-	dst_rows = get_local_size(1) - nm + 1;
+	src_cols = get_local_size(0);
+	src_rows = get_local_size(1);
+
+	dst_cols = src_cols - nm + 1;
+	dst_rows = src_rows - nm + 1;
 
 	/* determine indices on which to operate */
 	til_col = get_local_id(0);
 	til_row = get_local_id(1);
 
-	dst_col = get_group_id(0) * dst_cols + til_col; /* not get_global_id(0); */
-	dst_row = get_group_id(1) * dst_rows + til_row; /* not get_global_id(1); */
+	dst_col = get_group_id(0) * dst_cols + til_col;
+	dst_row = get_group_id(1) * dst_rows + til_row;
 
 	src_col = dst_col - nm/2;
 	src_row = dst_row - nm/2;
 
-	dst_idx = dst_row * nx + dst_col;
-	src_idx = src_row * nx + src_col;
-
 	/* shared memory tile: __local gives access to all threads in the group */
 	__local fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1];
 
-	if ((src_row >= 0) && (src_row < ny) &&
-	    (src_col >= 0) && (src_col < nx)) {
-		/* if src_row==0, then dst_row==nm/2: this is a halo row */
-		d_conc_tile[til_row][til_col] = d_conc_old[src_idx];
-	} else {
-		/* points outside the halo should be switched off */
-		d_conc_tile[til_row][til_col] = 0.;
+	if (src_row >= 0 && src_row < ny &&
+	    src_col >= 0 && src_col < nx) {
+		d_conc_tile[til_row][til_col] = d_conc_old[src_row * nx + src_col];
 	}
 
 	/* tile data is shared: wait for all threads to finish copying */
@@ -97,7 +94,7 @@ __kernel void convolution_kernel(__global fp_t* d_conc_old,
 		}
 		/* record value */
 		if (dst_row < ny && dst_col < nx) {
-			d_conc_lap[dst_idx] = value;
+			d_conc_lap[dst_row * nx + dst_col] = value;
 		}
 	}
 
diff --git a/gpu-opencl-diffusion/opencl_data.c b/gpu-opencl-diffusion/opencl_data.c
index 860aa7f..2b71153 100644
--- a/gpu-opencl-diffusion/opencl_data.c
+++ b/gpu-opencl-diffusion/opencl_data.c
@@ -228,6 +228,12 @@ void build_program(const char* filename,
 void free_opencl(struct OpenCLData* dev)
 {
 	/* clean up */
+	free(dev->conc_old);
+	free(dev->conc_new);
+	free(dev->conc_lap);
+	free(dev->bc);
+	free(dev->mask);
+
 	clReleaseContext(dev->context);
 
 	clReleaseKernel(dev->boundary_kernel);
diff --git a/gpu-opencl-diffusion/opencl_discretization.c b/gpu-opencl-diffusion/opencl_discretization.c
index 264a080..8c0a728 100644
--- a/gpu-opencl-diffusion/opencl_discretization.c
+++ b/gpu-opencl-diffusion/opencl_discretization.c
@@ -55,38 +55,26 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new,
 	cl_int status = CL_SUCCESS;
 
 	/* set immutable kernel arguments */
-	status = clSetKernelArg(dev->boundary_kernel, 1, sizeof(cl_mem), (void *)&(dev->bc));
-	report_error(status, "const boundary args[1]");
-	status = clSetKernelArg(dev->boundary_kernel, 2, sizeof(int), (void *)&nx);
-	report_error(status, "const boundary args[2]");
-	status = clSetKernelArg(dev->boundary_kernel, 3, sizeof(int), (void *)&ny);
-	report_error(status, "const boundary args[3]");
-	status = clSetKernelArg(dev->boundary_kernel, 4, sizeof(int), (void *)&nm);
-	report_error(status, "const boundary args[4]");
-
-	status = clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap));
-	report_error(status, "const convolution args[1]");
-	status = clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask));
-	report_error(status, "const convolution args[2]");
-	status = clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx);
-	report_error(status, "const convolution args[3]");
-	status = clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny);
-	report_error(status, "const convolution args[4]");
-	status = clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm);
-	report_error(status, "const convolution args[5]");
-
-	status = clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap));
-	report_error(status, "const diffusion args[2]");
-	status = clSetKernelArg(dev->diffusion_kernel, 3, sizeof(int), (void *)&nx);
-	report_error(status, "const diffusion args[3]");
-	status = clSetKernelArg(dev->diffusion_kernel, 4, sizeof(int), (void *)&ny);
-	report_error(status, "const diffusion args[4]");
-	status = clSetKernelArg(dev->diffusion_kernel, 5, sizeof(int), (void *)&nm);
-	report_error(status, "const diffusion args[5]");
-	status = clSetKernelArg(dev->diffusion_kernel, 6, sizeof(fp_t), (void *)&D);
-	report_error(status, "const diffusion args[6]");
-	status = clSetKernelArg(dev->diffusion_kernel, 7, sizeof(fp_t), (void *)&dt);
-	report_error(status, "const diffusion args[7]");
+	status |= clSetKernelArg(dev->boundary_kernel, 1, sizeof(cl_mem), (void *)&(dev->bc));
+	status |= clSetKernelArg(dev->boundary_kernel, 2, sizeof(int), (void *)&nx);
+	status |= clSetKernelArg(dev->boundary_kernel, 3, sizeof(int), (void *)&ny);
+	status |= clSetKernelArg(dev->boundary_kernel, 4, sizeof(int), (void *)&nm);
+	report_error(status, "constant boundary kernal args");
+
+	status |= clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap));
+	status |= clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask));
+	status |= clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx);
+	status |= clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny);
+	status |= clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm);
+	report_error(status, "constant convolution kernel args");
+
+	status |= clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap));
+	status |= clSetKernelArg(dev->diffusion_kernel, 3, sizeof(int), (void *)&nx);
+	status |= clSetKernelArg(dev->diffusion_kernel, 4, sizeof(int), (void *)&ny);
+	status |= clSetKernelArg(dev->diffusion_kernel, 5, sizeof(int), (void *)&nm);
+	status |= clSetKernelArg(dev->diffusion_kernel, 6, sizeof(fp_t), (void *)&D);
+	status |= clSetKernelArg(dev->diffusion_kernel, 7, sizeof(fp_t), (void *)&dt);
+	report_error(status, "constant diffusion kernel args");
 
 	/* OpenCL uses cl_mem, not fp_t*, so swap_pointers won't work.
      * We leave the pointers alone but call the kernel on the appropriate data location.
@@ -103,26 +91,20 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new,
 
 		/* set time-dependent kernel arguments */
 		status = clSetKernelArg(dev->boundary_kernel,    0, sizeof(cl_mem), (void *)&d_conc_old);
-		report_error(status, "mutable boundary args[0]");
+		report_error(status, "mutable boundary kernel args");
 
 		status = clSetKernelArg(dev->convolution_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old);
-		report_error(status, "mutable convolution args[0]");
+		report_error(status, "mutable convolution kernel args");
 
-		status = clSetKernelArg(dev->diffusion_kernel,   0, sizeof(cl_mem), (void *)&d_conc_old);
-		report_error(status, "mutable diffusion args[0]");
-
-		status = clSetKernelArg(dev->diffusion_kernel,   1, sizeof(cl_mem), (void *)&d_conc_new);
-		report_error(status, "mutable diffusion args[1]");
+		status |= clSetKernelArg(dev->diffusion_kernel,   0, sizeof(cl_mem), (void *)&d_conc_old);
+		status |= clSetKernelArg(dev->diffusion_kernel,   1, sizeof(cl_mem), (void *)&d_conc_new);
+		report_error(status, "mutable diffusion kernel args");
 
 		/* enqueue kernels */
-		status = clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel,    2, NULL, grid_dim, block_dim, 0, NULL, NULL);
-		report_error(status, "enqueue boundary kernel");
-
-		status = clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
-		report_error(status, "enqueue convolution kernel");
-
-		status = clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel,   2, NULL, grid_dim, block_dim, 0, NULL, NULL);
-		report_error(status, "enqueue diffusion kernel");
+		status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel,    2, NULL, grid_dim, block_dim, 0, NULL, NULL);
+		status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
+		status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel,   2, NULL, grid_dim, block_dim, 0, NULL, NULL);
+		report_error(status, "enqueue kernels");
 	}
 
 	*elapsed += dt * checks;