From b90eca05a41e7a6f6b9633773f7e314408137c24 Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Tue, 10 Oct 2017 12:06:47 -0400 Subject: [PATCH 1/4] add blocksize to params --- Makefile | 2 + common-diffusion/output.c | 69 ++++++++++-------- common-diffusion/output.h | 2 +- cpu-analytic-diffusion/analytic_main.c | 4 +- cpu-openmp-diffusion/openmp_main.c | 4 +- cpu-serial-diffusion/serial_main.c | 4 +- cpu-tbb-diffusion/tbb_main.c | 4 +- gpu-cuda-diffusion/cuda_data.h | 5 +- gpu-cuda-diffusion/cuda_discretization.cu | 85 ++++++++++++----------- gpu-cuda-diffusion/cuda_kernels.cuh | 10 --- gpu-cuda-diffusion/cuda_main.c | 6 +- gpu-openacc-diffusion/openacc_main.c | 4 +- gpu-opencl-diffusion/opencl_main.c | 4 +- phi-openmp-diffusion/phi_main.c | 4 +- 14 files changed, 110 insertions(+), 97 deletions(-) diff --git a/Makefile b/Makefile index 466f148..b6abd02 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,8 @@ gpu_diffusion_list := gpu-cuda-diffusion \ gpu-openacc-diffusion \ gpu-opencl-diffusion +phi_diffusion_list := phi-openmp-diffusion + .PHONY: run run: run_cpu_diffusion run_gpu_diffusion diff --git a/common-diffusion/output.c b/common-diffusion/output.c index b14cdf5..3f4204f 100644 --- a/common-diffusion/output.c +++ b/common-diffusion/output.c @@ -29,12 +29,13 @@ #include #include "output.h" -void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks) +/* void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks) */ +void param_parser(int argc, char* argv[], int* bx, int* by, int* checks, int* code, fp_t* D, fp_t* dx, fp_t* dy, fp_t* linStab, int* nm, int* nx, int* ny, int* steps) { FILE * input; char buffer[256]; char* pch; - int inx=0, iny=0, idx=0, idy=0, ins=0, inc=0, idc=0, ico=0, isc=0; + int ibx=0, iby=0, ico=0, idc=0, idx=0, idy=0, inc=0, ins=0, inx=0, iny=0, isc=0; if (argc != 2) { printf("Error: improper arguments supplied.\nUsage: ./%s filename\n", argv[0]); @@ -53,14 +54,22 @@ void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, { pch = strtok(buffer, " "); - if (strcmp(pch, "nx") == 0) { + if (strcmp(pch, "bx") == 0) { pch = strtok(NULL, " "); - *nx = atoi(pch); - inx = 1; - } else if (strcmp(pch, "ny") == 0) { + *bx = atoi(pch); + ibx = 1; + } else if (strcmp(pch, "by") == 0) { pch = strtok(NULL, " "); - *ny = atoi(pch); - iny = 1; + *by = atoi(pch); + iby = 1; + } else if (strcmp(pch, "co") == 0) { + pch = strtok(NULL, " "); + *linStab = atof(pch); + ico = 1; + } else if (strcmp(pch, "dc") == 0) { + pch = strtok(NULL, " "); + *D = atof(pch); + idc = 1; } else if (strcmp(pch, "dx") == 0) { pch = strtok(NULL, " "); *dx = atof(pch); @@ -69,22 +78,22 @@ void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, pch = strtok(NULL, " "); *dy = atof(pch); idy = 1; - } else if (strcmp(pch, "ns") == 0) { - pch = strtok(NULL, " "); - *steps = atoi(pch); - ins = 1; } else if (strcmp(pch, "nc") == 0) { pch = strtok(NULL, " "); *checks = atoi(pch); inc = 1; - } else if (strcmp(pch, "dc") == 0) { + } else if (strcmp(pch, "ns") == 0) { pch = strtok(NULL, " "); - *D = atof(pch); - idc = 1; - } else if (strcmp(pch, "co") == 0) { + *steps = atoi(pch); + ins = 1; + } else if (strcmp(pch, "nx") == 0) { pch = strtok(NULL, " "); - *linStab = atof(pch); - ico = 1; + *nx = atoi(pch); + inx = 1; + } else if (strcmp(pch, "ny") == 0) { + pch = strtok(NULL, " "); + *ny = 
atoi(pch); + iny = 1; } else if (strcmp(pch, "sc") == 0) { pch = strtok(NULL, " "); *nm = atoi(pch); @@ -98,22 +107,26 @@ void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, } /* make sure we got everyone */ - if (! inx) { - printf("Warning: parameter %s undefined. Using default value, %i.\n", "nx", *nx); - } else if (! iny) { - printf("Warning: parameter %s undefined. Using default value, %i.\n", "ny", *ny); + if (! ibx) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "bx", *bx); + } else if (! iby) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "by", *by); + } else if (! ico) { + printf("Warning: parameter %s undefined. Using default value, %f.\n", "co", *linStab); + } else if (! idc) { + printf("Warning: parameter %s undefined. Using default value, %f.\n", "dc", *D); } else if (! idx) { printf("Warning: parameter %s undefined. Using default value, %f.\n", "dx", *dx); } else if (! idy) { printf("Warning: parameter %s undefined. Using default value, %f.\n", "dy", *dy); - } else if (! ins) { - printf("Warning: parameter %s undefined. Using default value, %i.\n", "ns", *steps); } else if (! inc) { printf("Warning: parameter %s undefined. Using default value, %i.\n", "nc", *checks); - } else if (! idc) { - printf("Warning: parameter %s undefined. Using default value, %f.\n", "dc", *D); - } else if (! ico) { - printf("Warning: parameter %s undefined. Using default value, %f.\n", "co", *linStab); + } else if (! ins) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "ns", *steps); + } else if (! inx) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "nx", *nx); + } else if (! iny) { + printf("Warning: parameter %s undefined. Using default value, %i.\n", "ny", *ny); } else if (! isc) { printf("Warning: parameter %s undefined. Using default values, %i and %i.\n", "sc", *nm, *code); } diff --git a/common-diffusion/output.h b/common-diffusion/output.h index 38c171a..9643390 100644 --- a/common-diffusion/output.h +++ b/common-diffusion/output.h @@ -32,7 +32,7 @@ /** \brief Read parameters from file specified on the command line */ -void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks); +void param_parser(int argc, char* argv[], int* bx, int* by, int* checks, int* code, fp_t* D, fp_t* dx, fp_t* dy, fp_t* linStab, int* nm, int* nx, int* ny, int* steps); /** \brief Prints timestamps and a 20-point progress bar to stdout diff --git a/cpu-analytic-diffusion/analytic_main.c b/cpu-analytic-diffusion/analytic_main.c index 3f4e8a9..eb8969e 100644 --- a/cpu-analytic-diffusion/analytic_main.c +++ b/cpu-analytic-diffusion/analytic_main.c @@ -72,7 +72,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; /* declare default materials and numerical parameters */ @@ -82,7 +82,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? 
dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/cpu-openmp-diffusion/openmp_main.c b/cpu-openmp-diffusion/openmp_main.c index d3a284a..4b4a5d1 100644 --- a/cpu-openmp-diffusion/openmp_main.c +++ b/cpu-openmp-diffusion/openmp_main.c @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -55,7 +55,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/cpu-serial-diffusion/serial_main.c b/cpu-serial-diffusion/serial_main.c index c0a705f..da8a752 100644 --- a/cpu-serial-diffusion/serial_main.c +++ b/cpu-serial-diffusion/serial_main.c @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -63,7 +63,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/cpu-tbb-diffusion/tbb_main.c b/cpu-tbb-diffusion/tbb_main.c index 33539c0..10bc7b3 100644 --- a/cpu-tbb-diffusion/tbb_main.c +++ b/cpu-tbb-diffusion/tbb_main.c @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -55,7 +55,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? 
dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/gpu-cuda-diffusion/cuda_data.h b/gpu-cuda-diffusion/cuda_data.h index 4f2159f..475c098 100644 --- a/gpu-cuda-diffusion/cuda_data.h +++ b/gpu-cuda-diffusion/cuda_data.h @@ -48,8 +48,9 @@ void init_cuda(fp_t** conc_old, fp_t** mask_lap, fp_t bc[2][2], \brief Specialization of solve_diffusion_equation() using CUDA */ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, - int nx, int ny, int nm, - fp_t bc[2][2], fp_t D, fp_t dt, int checks, + fp_t bc[2][2], int bx, int by, + int nm, int nx, int ny, + fp_t D, fp_t dt, int checks, fp_t *elapsed, struct Stopwatch* sw); /** diff --git a/gpu-cuda-diffusion/cuda_discretization.cu b/gpu-cuda-diffusion/cuda_discretization.cu index 36651ca..c04dd94 100644 --- a/gpu-cuda-diffusion/cuda_discretization.cu +++ b/gpu-cuda-diffusion/cuda_discretization.cu @@ -46,53 +46,56 @@ __global__ void convolution_kernel(fp_t* d_conc_old, const int ny, const int nm) { - int i, j, tx, ty; - int dst_row, dst_col, dst_cols, dst_rows; - int src_row, src_col, src_cols, src_rows; + int i, j, y; + int dst_til_y, dst_til_x, dst_til_nx, dst_til_ny; + int src_til_y, src_til_x, src_til_nx, src_til_ny; + int sha_til_x, sha_til_y, sha_til_nx; fp_t value=0.; - /* source tile width includes the halo cells */ - src_cols = blockDim.x; - src_rows = blockDim.y; + /* source and shared tile width include the halo cells */ + src_til_nx = blockDim.x; + src_til_ny = blockDim.y; + sha_til_nx = src_til_nx; /* destination tile width excludes the halo cells */ - dst_cols = src_cols - nm + 1; - dst_rows = src_rows - nm + 1; + dst_til_nx = src_til_nx - nm + 1; + dst_til_ny = src_til_ny - nm + 1; /* determine indices on which to operate */ - tx = threadIdx.x; - ty = threadIdx.y; + sha_til_x = threadIdx.x; + sha_til_y = threadIdx.y; - dst_col = blockIdx.x * dst_cols + tx; - dst_row = blockIdx.y * dst_rows + ty; + dst_til_x = blockIdx.x * dst_til_nx + sha_til_x; + dst_til_y = blockIdx.y * dst_til_ny + sha_til_y; - src_col = dst_col - nm/2; - src_row = dst_row - nm/2; + src_til_x = dst_til_x - nm/2; + src_til_y = dst_til_y - nm/2; /* copy tile: __shared__ gives access to all threads working on this tile */ - __shared__ fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1]; + extern __shared__ fp_t d_conc_tile[]; - if (src_row >= 0 && src_row < ny && - src_col >= 0 && src_col < nx) { - /* if src_row==0, then dst_row==nm/2: this is a halo row */ - d_conc_tile[ty][tx] = d_conc_old[src_row * nx + src_col]; + if (src_til_y >= 0 && src_til_y < ny && + src_til_x >= 0 && src_til_x < nx) { + /* if src_til_y==0, then dst_til_y==nm/2: this is a halo row */ + d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = d_conc_old[src_til_y * nx + src_til_x]; } else { - d_conc_tile[ty][tx] = 0.; + d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = 0.; } /* tile data is shared: wait for all threads to finish copying */ __syncthreads(); /* compute the convolution */ - if (tx < dst_cols && ty < dst_rows) { + if (sha_til_x < dst_til_nx && sha_til_y < dst_til_ny) { for (j = 0; j < nm; j++) { + y = (j+sha_til_y) * sha_til_nx; for (i = 0; i < nm; i++) { - value += d_mask[j * nm + i] * d_conc_tile[j+ty][i+tx]; + value += d_mask[j * nm + i] * d_conc_tile[y + i + sha_til_x]; } } /* record value */ - if (dst_row < ny && dst_col < nx) { - d_conc_lap[dst_row * nx + dst_col] = value; + if (dst_til_y < ny && dst_til_x < nx) { + d_conc_lap[dst_til_y * nx + dst_til_x] = value; } } @@ -109,19 +112,19 @@ __global__ void diffusion_kernel(fp_t* d_conc_old, const 
fp_t D, const fp_t dt) { - int tx, ty, row, col; + int thr_x, thr_y, x, y; /* determine indices on which to operate */ - tx = threadIdx.x; - ty = threadIdx.y; + thr_x = threadIdx.x; + thr_y = threadIdx.y; - col = blockDim.x * blockIdx.x + tx; - row = blockDim.y * blockIdx.y + ty; + x = blockDim.x * blockIdx.x + thr_x; + y = blockDim.y * blockIdx.y + thr_y; /* explicit Euler solution to the equation of motion */ - if (row < ny && col < nx) { - d_conc_new[row * nx + col] = d_conc_old[row * nx + col] - + dt * D * d_conc_lap[row * nx + col]; + if (x < nx && y < ny) { + d_conc_new[y * nx + x] = d_conc_old[y * nx + x] + + dt * D * d_conc_lap[y * nx + x]; } /* wait for all threads to finish writing */ @@ -129,7 +132,8 @@ __global__ void diffusion_kernel(fp_t* d_conc_old, } void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, - int nx, int ny, int nm, fp_t bc[2][2], + fp_t bc[2][2], int bx, int by, + int nm, int nx, int ny, fp_t D, fp_t dt, int checks, fp_t* elapsed, struct Stopwatch* sw) { @@ -138,9 +142,9 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, /* divide matrices into blocks of (TILE_W x TILE_H) threads */ - /* extremely precarious */ - dim3 tile_dim(TILE_W - nm + 1, TILE_H - nm + 1, 1); + dim3 tile_dim(bx - nm + 1, by - nm + 1, 1); dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1); + size_t tile_size = bx * by; for (check = 0; check < checks; check++) { /* apply boundary conditions */ @@ -150,7 +154,7 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, /* compute Laplacian */ start_time = GetTimer(); - convolution_kernel<<>> ( + convolution_kernel<<>> ( dev->conc_old, dev->conc_lap, nx, ny, nm ); sw->conv += GetTimer() - start_time; @@ -224,7 +228,9 @@ void check_solution(fp_t** conc_new, fp_t** conc_lap, int nx, int ny, } void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, - int nx, int ny, int nm) + int bx, int by, + int nm, + int nx, int ny) { /* If you must compute the convolution separately, do so here. 
* It is strongly recommended that you roll CUDA tasks into one function: @@ -240,8 +246,9 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, /* divide matrices into blocks of (TILE_W x TILE_W) threads */ /* dim3 tile_dim(TILE_W - nm/2, TILE_H - nm/2, 1); */ /* dim3 bloc_dim(ceil(fp_t(nx)/tile_dim.x)+1, ceil(fp_t(ny)/tile_dim.y)+1, 1); */ - dim3 tile_dim(TILE_W - nm +1, TILE_H - nm + 1, 1); + dim3 tile_dim(bx - nm +1, by - nm + 1, 1); dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1); + size_t tile_size = bx * by; /* transfer mask in to constant device memory */ cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t)); @@ -251,7 +258,7 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, cudaMemcpyHostToDevice); /* compute Laplacian */ - convolution_kernel<<>> ( + convolution_kernel<<>> ( d_conc_old, d_conc_lap, nx, ny, nm ); diff --git a/gpu-cuda-diffusion/cuda_kernels.cuh b/gpu-cuda-diffusion/cuda_kernels.cuh index a1539f1..7f07e55 100644 --- a/gpu-cuda-diffusion/cuda_kernels.cuh +++ b/gpu-cuda-diffusion/cuda_kernels.cuh @@ -31,16 +31,6 @@ extern "C" { #include "numerics.h" } -/** - \brief Width of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_W 32 - -/** - \brief Height of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_H 32 - /** \brief Convolution mask array on the GPU, allocated in protected memory */ diff --git a/gpu-cuda-diffusion/cuda_main.c b/gpu-cuda-diffusion/cuda_main.c index 77304ef..a833eee 100644 --- a/gpu-cuda-diffusion/cuda_main.c +++ b/gpu-cuda-diffusion/cuda_main.c @@ -48,7 +48,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -60,7 +60,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); @@ -109,7 +109,7 @@ int main(int argc, char* argv[]) assert(step + checks <= steps); - cuda_diffusion_solver(&dev, conc_new, nx, ny, nm, bc, D, dt, checks, &elapsed, &sw); + cuda_diffusion_solver(&dev, conc_new, bc, bx, by, nm, nx, ny, D, dt, checks, &elapsed, &sw); for (i = 0; i < checks; i++) { step++; diff --git a/gpu-openacc-diffusion/openacc_main.c b/gpu-openacc-diffusion/openacc_main.c index 8de3269..7122980 100644 --- a/gpu-openacc-diffusion/openacc_main.c +++ b/gpu-openacc-diffusion/openacc_main.c @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -63,7 +63,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? 
dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/gpu-opencl-diffusion/opencl_main.c b/gpu-opencl-diffusion/opencl_main.c index 06131af..245fa8f 100644 --- a/gpu-opencl-diffusion/opencl_main.c +++ b/gpu-opencl-diffusion/opencl_main.c @@ -49,7 +49,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -61,7 +61,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); diff --git a/phi-openmp-diffusion/phi_main.c b/phi-openmp-diffusion/phi_main.c index 1835ef8..fd99a42 100644 --- a/phi-openmp-diffusion/phi_main.c +++ b/phi-openmp-diffusion/phi_main.c @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) /* declare default mesh size and resolution */ fp_t **conc_old, **conc_new, **conc_lap, **mask_lap; - int nx=512, ny=512, nm=3, code=53; + int bx=32, by=32, nx=512, ny=512, nm=3, code=53; fp_t dx=0.5, dy=0.5, h=0.5; fp_t bc[2][2]; @@ -55,7 +55,7 @@ int main(int argc, char* argv[]) StartTimer(); - param_parser(argc, argv, &nx, &ny, &nm, &code, &dx, &dy, &D, &linStab, &steps, &checks); + param_parser(argc, argv, &bx, &by, &checks, &code, &D, &dx, &dy, &linStab, &nm, &nx, &ny, &steps); h = (dx > dy) ? dy : dx; dt = (linStab * h * h) / (4.0 * D); From 70437d2c202acc2441c0e9d5fe29b2e8cff807e0 Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Mon, 16 Oct 2017 17:58:44 -0400 Subject: [PATCH 2/4] Fix root cause of tiled algorithm bugs (block size spec) Closes #96, closes #118. 
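
The gist of the fix, sketched from the hunks below (the diffs themselves are authoritative; the snippet assumes the enclosing cuda_diffusion_solver() scope, i.e. dev, nx, ny, and nm): rather than shrinking each thread block by the stencil halo and padding the grid with extra blocks, the kernels now launch full TILE_W x TILE_H blocks, and the grid is sized so that neighboring blocks overlap by the halo width.

    /* old launch configuration ("extremely precarious") */
    dim3 tile_dim(TILE_W - nm + 1, TILE_H - nm + 1, 1);
    dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x),
                  nm - 1 + floor(float(ny)/tile_dim.y), 1);

    /* new launch configuration: full-sized blocks; the grid advances by the
       interior tile width (block minus halo) so every point is covered */
    dim3 tile_size(TILE_W, TILE_H, 1);
    dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)),
                   ceil(float(ny) / (tile_size.y - nm + 1)), 1);
    convolution_kernel<<<num_tiles, tile_size>>> (dev->conc_old, dev->conc_lap, nx, ny, nm);

The same launch configuration is reused in compute_convolution() and mirrored in the OpenCL work-group setup.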
---
 common-diffusion/timer.c | 3 ++
 gpu-cuda-diffusion/cuda_discretization.cu | 36 ++++++++++++--------
 gpu-opencl-diffusion/opencl_discretization.c | 18 +++++-----
 3 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/common-diffusion/timer.c b/common-diffusion/timer.c
index 978af5d..1253307 100644
--- a/common-diffusion/timer.c
+++ b/common-diffusion/timer.c
@@ -36,6 +36,9 @@
 #ifndef __USE_BSD
 #define __USE_BSD
 #endif
+ #ifndef __USE_MISC
+ #define __USE_MISC
+ #endif
 #include 
 #endif

diff --git a/gpu-cuda-diffusion/cuda_discretization.cu b/gpu-cuda-diffusion/cuda_discretization.cu
index 36651ca..8221575 100644
--- a/gpu-cuda-diffusion/cuda_discretization.cu
+++ b/gpu-cuda-diffusion/cuda_discretization.cu
@@ -136,37 +136,40 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new,
 	double start_time;
 	int check=0;
 
-	/* divide matrices into blocks of (TILE_W x TILE_H) threads */
-
-	/* extremely precarious */
-	dim3 tile_dim(TILE_W - nm + 1, TILE_H - nm + 1, 1);
-	dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1);
+	/* divide matrices into blocks of TILE_W * TILE_H threads */
+	dim3 tile_size(TILE_W,
+	               TILE_H,
+	               1);
+	dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)),
+	               ceil(float(ny) / (tile_size.y - nm + 1)),
+	               1);
 
 	for (check = 0; check < checks; check++) {
 		/* apply boundary conditions */
-		boundary_kernel<<<bloc_dim, tile_dim>>> (
+		boundary_kernel<<<num_tiles, tile_size>>> (
 			dev->conc_old, nx, ny, nm
 		);
 
 		/* compute Laplacian */
 		start_time = GetTimer();
-		convolution_kernel<<<bloc_dim, tile_dim>>> (
+		convolution_kernel<<<num_tiles, tile_size>>> (
 			dev->conc_old, dev->conc_lap, nx, ny, nm
 		);
 		sw->conv += GetTimer() - start_time;
 
 		/* compute result */
 		start_time = GetTimer();
-		diffusion_kernel<<<bloc_dim, tile_dim>>> (
+		diffusion_kernel<<<num_tiles, tile_size>>> (
 			dev->conc_old, dev->conc_new, dev->conc_lap, nx, ny, nm, D, dt
 		);
 		sw->step += GetTimer() - start_time;
 
 		swap_pointers_1D(&(dev->conc_old), &(dev->conc_new));
+
+		*elapsed += dt;
 	}
 	/* after swap, new data is in dev->conc_old */
-	*elapsed += dt * checks;
 
 	/* transfer from device out to host (conc_new) */
 	start_time = GetTimer();
@@ -232,16 +235,19 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap,
 	 */
 
 	fp_t* d_conc_old, *d_conc_lap;
+	const int margin = 0; /* nm - 1; */
 
 	/* allocate memory on device */
 	cudaMalloc((void **) &d_conc_old, nx * ny * sizeof(fp_t));
 	cudaMalloc((void **) &d_conc_lap, nx * ny * sizeof(fp_t));
 
-	/* divide matrices into blocks of (TILE_W x TILE_W) threads */
-	/* dim3 tile_dim(TILE_W - nm/2, TILE_H - nm/2, 1); */
-	/* dim3 bloc_dim(ceil(fp_t(nx)/tile_dim.x)+1, ceil(fp_t(ny)/tile_dim.y)+1, 1); */
-	dim3 tile_dim(TILE_W - nm +1, TILE_H - nm + 1, 1);
-	dim3 bloc_dim(nm - 1 + floor(float(nx)/tile_dim.x), nm - 1 + floor(float(ny)/tile_dim.y), 1);
+	/* divide matrices into blocks of TILE_W * TILE_H threads */
+	dim3 tile_size(TILE_W,
+	               TILE_H,
+	               1);
+	dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)),
+	               ceil(float(ny) / (tile_size.y - nm + 1)),
+	               1);
 
 	/* transfer mask in to constant device memory */
 	cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t));
@@ -251,7 +257,7 @@ void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap,
 
 	/* compute Laplacian */
-	convolution_kernel<<<bloc_dim, tile_dim>>> (
+	convolution_kernel<<<num_tiles, tile_size>>> (
 		d_conc_old, d_conc_lap, nx, ny, nm
 	);
 
diff --git a/gpu-opencl-diffusion/opencl_discretization.c b/gpu-opencl-diffusion/opencl_discretization.c
index 6a40a3b..d9cdf65 100644
--- a/gpu-opencl-diffusion/opencl_discretization.c
+++ b/gpu-opencl-diffusion/opencl_discretization.c
@@ -46,15 
+46,12 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, power of two: 4, 8, 16, 32, etc. OpenCL will make a best-guess optimal block size if you set size_t* tile_dim = NULL. */ - /* - size_t grid_dim[2] = {(size_t)nx, (size_t)ny}; - size_t tile_dim[2] = {TILE_W, TILE_H}; - */ - - /* extremely precarious */ - size_t tile_dim[2] = {TILE_W - nm + 1, TILE_H - nm + 1}; - size_t bloc_dim[2] = {nm - 1 + floor((float)(nx)/tile_dim[0]), nm - 1 + floor((float)(ny)/tile_dim[1])}; - size_t grid_dim[2] = {bloc_dim[0]*tile_dim[0], bloc_dim[1]*tile_dim[1]}; + size_t tile_dim[2] = {TILE_W, + TILE_H}; + size_t bloc_dim[2] = {ceil((float)(nx) / (tile_dim[0] - nm + 1)), + ceil((float)(ny) / (tile_dim[1] - nm + 1))}; + size_t grid_dim[2] = {bloc_dim[0] * tile_dim[0], + bloc_dim[1] * tile_dim[1]}; cl_mem d_conc_old = dev->conc_old; cl_mem d_conc_new = dev->conc_new; @@ -112,9 +109,10 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); report_error(status, "enqueue kernels"); + + *elapsed += dt; } - *elapsed += dt * checks; /* transfer from device out to host */ start_time = GetTimer(); From b34632b5d560160f964a114b3781abc42f3307b8 Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Tue, 17 Oct 2017 14:39:49 -0400 Subject: [PATCH 3/4] physical variable names, dynamic tiles in GPU convolution kernels Closes #113, closes #114 --- common-diffusion/output.c | 1 - gpu-cuda-diffusion/cuda_discretization.cu | 179 ++++++++++--------- gpu-opencl-diffusion/kernel_boundary.cl | 32 ++-- gpu-opencl-diffusion/kernel_convolution.cl | 51 +++--- gpu-opencl-diffusion/kernel_diffusion.cl | 2 +- gpu-opencl-diffusion/opencl_boundaries.c | 4 +- gpu-opencl-diffusion/opencl_data.c | 3 +- gpu-opencl-diffusion/opencl_data.h | 2 +- gpu-opencl-diffusion/opencl_discretization.c | 34 ++-- gpu-opencl-diffusion/opencl_kernels.h | 44 ----- gpu-opencl-diffusion/opencl_main.c | 2 +- 11 files changed, 164 insertions(+), 190 deletions(-) delete mode 100644 gpu-opencl-diffusion/opencl_kernels.h diff --git a/common-diffusion/output.c b/common-diffusion/output.c index 3f4204f..f305f74 100644 --- a/common-diffusion/output.c +++ b/common-diffusion/output.c @@ -29,7 +29,6 @@ #include #include "output.h" -/* void param_parser(int argc, char* argv[], int* nx, int* ny, int* nm, int* code, fp_t* dx, fp_t* dy, fp_t* D, fp_t* linStab, int* steps, int* checks) */ void param_parser(int argc, char* argv[], int* bx, int* by, int* checks, int* code, fp_t* D, fp_t* dx, fp_t* dy, fp_t* linStab, int* nm, int* nx, int* ny, int* steps) { FILE * input; diff --git a/gpu-cuda-diffusion/cuda_discretization.cu b/gpu-cuda-diffusion/cuda_discretization.cu index b5bf596..3a68476 100644 --- a/gpu-cuda-diffusion/cuda_discretization.cu +++ b/gpu-cuda-diffusion/cuda_discretization.cu @@ -46,56 +46,53 @@ __global__ void convolution_kernel(fp_t* d_conc_old, const int ny, const int nm) { - int i, j, y; - int dst_til_y, dst_til_x, dst_til_nx, dst_til_ny; - int src_til_y, src_til_x, src_til_nx, src_til_ny; - int sha_til_x, sha_til_y, sha_til_nx; + int i, j; + int dst_x, dst_y, dst_nx, dst_ny; + int src_x, src_y, src_nx, src_ny; + int til_x, til_y, til_nx; fp_t value=0.; - /* source and shared tile width include the halo cells */ - src_til_nx = blockDim.x; - src_til_ny = blockDim.y; - sha_til_nx = 
src_til_nx; + /* source and tile width include the halo cells */ + src_nx = blockDim.x; + src_ny = blockDim.y; + til_nx = src_nx; - /* destination tile width excludes the halo cells */ - dst_til_nx = src_til_nx - nm + 1; - dst_til_ny = src_til_ny - nm + 1; + /* destination width excludes the halo cells */ + dst_nx = src_nx - nm + 1; + dst_ny = src_ny - nm + 1; - /* determine indices on which to operate */ - sha_til_x = threadIdx.x; - sha_til_y = threadIdx.y; + /* determine tile indices on which to operate */ + til_x = threadIdx.x; + til_y = threadIdx.y; - dst_til_x = blockIdx.x * dst_til_nx + sha_til_x; - dst_til_y = blockIdx.y * dst_til_ny + sha_til_y; + dst_x = blockIdx.x * dst_nx + til_x; + dst_y = blockIdx.y * dst_ny + til_y; - src_til_x = dst_til_x - nm/2; - src_til_y = dst_til_y - nm/2; + src_x = dst_x - nm/2; + src_y = dst_y - nm/2; /* copy tile: __shared__ gives access to all threads working on this tile */ extern __shared__ fp_t d_conc_tile[]; - if (src_til_y >= 0 && src_til_y < ny && - src_til_x >= 0 && src_til_x < nx) { - /* if src_til_y==0, then dst_til_y==nm/2: this is a halo row */ - d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = d_conc_old[src_til_y * nx + src_til_x]; - } else { - d_conc_tile[sha_til_y * sha_til_nx + sha_til_x] = 0.; + if (src_x >= 0 && src_x < nx && + src_y >= 0 && src_y < ny ) { + /* if src_y==0, then dst_y==nm/2: this is a halo row */ + d_conc_tile[til_nx * til_y + til_x] = d_conc_old[nx * src_y + src_x]; } /* tile data is shared: wait for all threads to finish copying */ __syncthreads(); /* compute the convolution */ - if (sha_til_x < dst_til_nx && sha_til_y < dst_til_ny) { + if (til_x < dst_nx && til_y < dst_ny) { for (j = 0; j < nm; j++) { - y = (j+sha_til_y) * sha_til_nx; for (i = 0; i < nm; i++) { - value += d_mask[j * nm + i] * d_conc_tile[y + i + sha_til_x]; + value += d_mask[j * nm + i] * d_conc_tile[til_nx * (til_y+j) + til_x+i]; } } /* record value */ - if (dst_til_y < ny && dst_til_x < nx) { - d_conc_lap[dst_til_y * nx + dst_til_x] = value; + if (dst_y < ny && dst_x < nx) { + d_conc_lap[nx * dst_y + dst_x] = value; } } @@ -103,6 +100,56 @@ __global__ void convolution_kernel(fp_t* d_conc_old, __syncthreads(); } +/** + \brief Reference showing how to invoke the convolution kernel. + + A stand-alone function like this incurs the cost of host-to-device data + transfer each time it is called: it is a teaching tool, not reusable code. + It is the basis for cuda_diffusion_solver(), which achieves much better + performance by bundling CUDA kernels together and intelligently managing + data transfers between the host (CPU) and device (GPU). 
+*/ +void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, + int bx, int by, + int nm, + int nx, int ny) +{ + fp_t* d_conc_old, *d_conc_lap; + + /* allocate memory on device */ + cudaMalloc((void **) &d_conc_old, nx * ny * sizeof(fp_t)); + cudaMalloc((void **) &d_conc_lap, nx * ny * sizeof(fp_t)); + + /* divide matrices into blocks of TILE_W * TILE_H threads */ + dim3 tile_size(bx, + by, + 1); + dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)), + ceil(float(ny) / (tile_size.y - nm + 1)), + 1); + size_t buf_size = (tile_size.x + nm) * (tile_size.x + nm) * sizeof(fp_t); + + /* transfer mask in to constant device memory */ + cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t)); + + /* transfer data from host in to device */ + cudaMemcpy(d_conc_old, conc_old[0], nx * ny * sizeof(fp_t), + cudaMemcpyHostToDevice); + + /* compute Laplacian */ + convolution_kernel<<>> ( + d_conc_old, d_conc_lap, nx, ny, nm + ); + + /* transfer from device out to host */ + cudaMemcpy(conc_lap[0], d_conc_lap, nx * ny * sizeof(fp_t), + cudaMemcpyDeviceToHost); + + /* free memory on device */ + cudaFree(d_conc_old); + cudaFree(d_conc_lap); +} + __global__ void diffusion_kernel(fp_t* d_conc_old, fp_t* d_conc_new, fp_t* d_conc_lap, @@ -123,14 +170,24 @@ __global__ void diffusion_kernel(fp_t* d_conc_old, /* explicit Euler solution to the equation of motion */ if (x < nx && y < ny) { - d_conc_new[y * nx + x] = d_conc_old[y * nx + x] - + dt * D * d_conc_lap[y * nx + x]; + d_conc_new[nx * y + x] = d_conc_old[nx * y + x] + + dt * D * d_conc_lap[nx * y + x]; } /* wait for all threads to finish writing */ __syncthreads(); } +/** + \brief Optimized code for solving the diffusion equation. + + Compare cuda_diffusion_solver(): it accomplishes the same result, but without + the memory allocation, data transfer, and array release. These are handled in + cuda_init(), with arrays on the host and device managed through CudaData, + which is a struct passed by reference into the function. In this way, + device kernels can be called in isolation without incurring the cost of data + transfers and with reduced risk of memory leaks. 
+*/ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, fp_t bc[2][2], int bx, int by, int nm, int nx, int ny, @@ -140,13 +197,14 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, double start_time; int check=0; - /* divide matrices into blocks of TILE_W *TILE_H threads */ - dim3 tile_size(TILE_W, - TILE_H, + /* divide matrices into blocks of bx * by threads */ + dim3 tile_size(bx, + by, 1); dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)), ceil(float(ny) / (tile_size.y - nm + 1)), 1); + size_t buf_size = (tile_size.x + nm) * (tile_size.y + nm) * sizeof(fp_t); for (check = 0; check < checks; check++) { /* apply boundary conditions */ @@ -156,7 +214,7 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, /* compute Laplacian */ start_time = GetTimer(); - convolution_kernel<<>> ( + convolution_kernel<<>> ( dev->conc_old, dev->conc_lap, nx, ny, nm ); sw->conv += GetTimer() - start_time; @@ -169,12 +227,11 @@ void cuda_diffusion_solver(struct CudaData* dev, fp_t** conc_new, sw->step += GetTimer() - start_time; swap_pointers_1D(&(dev->conc_old), &(dev->conc_new)); - - *elapsed += dt; } - /* after swap, new data is in dev->conc_old */ - /* transfer from device out to host (conc_new) */ + *elapsed += dt * checks; + + /* transfer result to host (conc_new) from device (dev->conc_old) */ start_time = GetTimer(); cudaMemcpy(conc_new[0], dev->conc_old, nx * ny * sizeof(fp_t), cudaMemcpyDeviceToHost); @@ -229,47 +286,3 @@ void check_solution(fp_t** conc_new, fp_t** conc_lap, int nx, int ny, *rss = sum; } -void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, - int bx, int by, - int nm, - int nx, int ny) -{ - /* If you must compute the convolution separately, do so here. - * It is strongly recommended that you roll CUDA tasks into one function: - * This legacy function is included to show basic usage of the kernel. 
- */ - - fp_t* d_conc_old, *d_conc_lap; - - /* allocate memory on device */ - cudaMalloc((void **) &d_conc_old, nx * ny * sizeof(fp_t)); - cudaMalloc((void **) &d_conc_lap, nx * ny * sizeof(fp_t)); - - /* divide matrices into blocks of TILE_W * TILE_H threads */ - dim3 tile_size(TILE_W, - TILE_H, - 1); - dim3 num_tiles(ceil(float(nx) / (tile_size.x - nm + 1)), - ceil(float(ny) / (tile_size.y - nm + 1)), - 1); - - /* transfer mask in to constant device memory */ - cudaMemcpyToSymbol(d_mask, mask_lap[0], nm * nm * sizeof(fp_t)); - - /* transfer data from host in to device */ - cudaMemcpy(d_conc_old, conc_old[0], nx * ny * sizeof(fp_t), - cudaMemcpyHostToDevice); - - /* compute Laplacian */ - convolution_kernel<<>> ( - d_conc_old, d_conc_lap, nx, ny, nm - ); - - /* transfer from device out to host */ - cudaMemcpy(conc_lap[0], d_conc_lap, nx * ny * sizeof(fp_t), - cudaMemcpyDeviceToHost); - - /* free memory on device */ - cudaFree(d_conc_old); - cudaFree(d_conc_lap); -} diff --git a/gpu-opencl-diffusion/kernel_boundary.cl b/gpu-opencl-diffusion/kernel_boundary.cl index 9842f79..c7e65a5 100644 --- a/gpu-opencl-diffusion/kernel_boundary.cl +++ b/gpu-opencl-diffusion/kernel_boundary.cl @@ -22,7 +22,7 @@ */ #pragma OPENCL EXTENSION cl_khr_fp64: enable -#include "opencl_kernels.h" +#include "numerics.h" /** \brief Boundary condition kernel for execution on the GPU @@ -37,22 +37,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc, int ny, int nm) { - int col, row; + int x, y; int ihi, ilo, jhi, jlo, offset; /* determine indices on which to operate */ - col = get_global_id(0); - row = get_global_id(1); + x = get_global_id(0); + y = get_global_id(1); /* apply fixed boundary values: sequence does not matter */ - if (row < ny/2 && col < 1+nm/2) { - d_conc[row * nx + col] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */ + if (x < 1+nm/2 && y < ny/2) { + d_conc[nx * y + x] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */ } - if (row >= ny/2 && row < ny && col >= nx-1-nm/2 && col < nx) { - d_conc[row * nx + col] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */ + if (x >= nx-1-nm/2 && x < nx && y >= ny/2 && y < ny) { + d_conc[nx * y + x] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */ } /* wait for all threads to finish writing */ @@ -66,17 +66,17 @@ __kernel void boundary_kernel(__global fp_t* d_conc, jlo = nm/2 - offset; jhi = ny - 1 - nm/2 + offset; - if (ilo-1 == col && row < ny) { - d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */ + if (ilo-1 == x && y < ny) { + d_conc[nx * y + ilo-1] = d_conc[nx * y + ilo]; /* left condition */ } - if (ihi+1 == col && row < ny) { - d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */ + if (ihi+1 == x && y < ny) { + d_conc[nx * y + ihi+1] = d_conc[nx * y + ihi]; /* right condition */ } - if (jlo-1 == row && col < nx) { - d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */ + if (jlo-1 == y && x < nx) { + d_conc[nx * (jlo-1) + x] = d_conc[nx * jlo + x]; /* bottom condition */ } - if (jhi+1 == row && col < nx) { - d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */ + if (jhi+1 == y && x < nx) { + d_conc[nx * (jhi+1) + x] = d_conc[nx * jhi + x]; /* top condition */ } barrier(CLK_GLOBAL_MEM_FENCE); diff --git a/gpu-opencl-diffusion/kernel_convolution.cl b/gpu-opencl-diffusion/kernel_convolution.cl index 291cd98..a8da030 100644 --- a/gpu-opencl-diffusion/kernel_convolution.cl +++ b/gpu-opencl-diffusion/kernel_convolution.cl @@ -22,7 +22,7 @@ */ #pragma OPENCL 
EXTENSION cl_khr_fp64: enable -#include "opencl_kernels.h" +#include "numerics.h" /** \brief Tiled convolution algorithm for execution on the GPU @@ -44,57 +44,56 @@ - The \a __local specifier allocates the small \a d_conc_tile array in cache - The \a __constant specifier allocates the small \a d_mask array in cache */ -__kernel void convolution_kernel(__global fp_t* d_conc_old, - __global fp_t* d_conc_lap, +__kernel void convolution_kernel(__global fp_t* d_conc_old, + __global fp_t* d_conc_lap, __constant fp_t* d_mask, + __local fp_t* d_conc_tile, int nx, int ny, int nm) { int i, j; - int dst_col, dst_cols, dst_row, dst_rows; - int src_col, src_cols, src_row, src_rows; - int til_col, til_row; + int dst_nx, dst_ny, dst_x, dst_y; + int src_nx, src_ny, src_x, src_y; + int til_nx, til_x, til_y; fp_t value = 0.; /* source tile includes the halo cells, destination tile does not */ - src_cols = get_local_size(0); - src_rows = get_local_size(1); + src_ny = get_local_size(0); + src_nx = get_local_size(1); + til_nx = src_nx; - dst_cols = src_cols - nm + 1; - dst_rows = src_rows - nm + 1; + dst_ny = src_ny - nm + 1; + dst_nx = src_nx - nm + 1; /* determine indices on which to operate */ - til_col = get_local_id(0); - til_row = get_local_id(1); + til_x = get_local_id(0); + til_y = get_local_id(1); - dst_col = get_group_id(0) * dst_cols + til_col; - dst_row = get_group_id(1) * dst_rows + til_row; + dst_x = get_group_id(0) * dst_ny + til_x; + dst_y = get_group_id(1) * dst_nx + til_y; - src_col = dst_col - nm/2; - src_row = dst_row - nm/2; + src_x = dst_x - nm/2; + src_y = dst_y - nm/2; - /* shared memory tile: __local gives access to all threads in the group */ - __local fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1]; - - if (src_row >= 0 && src_row < ny && - src_col >= 0 && src_col < nx) { - d_conc_tile[til_row][til_col] = d_conc_old[src_row * nx + src_col]; + if (src_x >= 0 && src_x < nx && + src_y >= 0 && src_y < ny) { + d_conc_tile[til_nx * til_y + til_x] = d_conc_old[nx * src_y + src_x]; } /* tile data is shared: wait for all threads to finish copying */ barrier(CLK_LOCAL_MEM_FENCE); /* compute the convolution */ - if (til_col < dst_cols && til_row < dst_rows) { + if (til_x < dst_ny && til_y < dst_nx) { for (j = 0; j < nm; j++) { for (i = 0; i < nm; i++) { - value += d_mask[j * nm + i] * d_conc_tile[j+til_row][i+til_col]; + value += d_mask[nm * j + i] * d_conc_tile[til_nx * (til_y+j) + til_x+i]; } } /* record value */ - if (dst_row < ny && dst_col < nx) { - d_conc_lap[dst_row * nx + dst_col] = value; + if (dst_y < ny && dst_x < nx) { + d_conc_lap[nx * dst_y + dst_x] = value; } } diff --git a/gpu-opencl-diffusion/kernel_diffusion.cl b/gpu-opencl-diffusion/kernel_diffusion.cl index 136b143..91e3f64 100644 --- a/gpu-opencl-diffusion/kernel_diffusion.cl +++ b/gpu-opencl-diffusion/kernel_diffusion.cl @@ -22,7 +22,7 @@ */ #pragma OPENCL EXTENSION cl_khr_fp64: enable -#include "opencl_kernels.h" +#include "numerics.h" /** \brief Diffusion equation kernel for execution on the GPU diff --git a/gpu-opencl-diffusion/opencl_boundaries.c b/gpu-opencl-diffusion/opencl_boundaries.c index b90fd84..060883b 100644 --- a/gpu-opencl-diffusion/opencl_boundaries.c +++ b/gpu-opencl-diffusion/opencl_boundaries.c @@ -25,10 +25,8 @@ #include #include #include - #include "boundaries.h" - -#include "opencl_kernels.h" +#include "numerics.h" void set_boundaries(fp_t bc[2][2]) { diff --git a/gpu-opencl-diffusion/opencl_data.c b/gpu-opencl-diffusion/opencl_data.c index 70cbbee..5d8f8be 100644 --- 
a/gpu-opencl-diffusion/opencl_data.c +++ b/gpu-opencl-diffusion/opencl_data.c @@ -26,9 +26,8 @@ #include #include #include - +#include "numerics.h" #include "opencl_data.h" -#include "opencl_kernels.h" void report_error(cl_int status, const char* message) { diff --git a/gpu-opencl-diffusion/opencl_data.h b/gpu-opencl-diffusion/opencl_data.h index eab8dd6..723041d 100644 --- a/gpu-opencl-diffusion/opencl_data.h +++ b/gpu-opencl-diffusion/opencl_data.h @@ -112,7 +112,7 @@ void init_opencl(fp_t** conc_old, fp_t** mask_lap, fp_t bc[2][2], \brief Specialization of solve_diffusion_equation() using OpenCL */ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, - int nx, int ny, int nm, + int bx, int by, int nx, int ny, int nm, fp_t D, fp_t dt, int checks, fp_t *elapsed, struct Stopwatch* sw); diff --git a/gpu-opencl-diffusion/opencl_discretization.c b/gpu-opencl-diffusion/opencl_discretization.c index d9cdf65..9093c0d 100644 --- a/gpu-opencl-diffusion/opencl_discretization.c +++ b/gpu-opencl-diffusion/opencl_discretization.c @@ -28,11 +28,12 @@ #include "boundaries.h" #include "discretization.h" #include "mesh.h" +#include "numerics.h" #include "timer.h" #include "opencl_data.h" -#include "opencl_kernels.h" void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, + int bx, int by, int nx, int ny, int nm, fp_t D, fp_t dt, int checks, fp_t *elapsed, struct Stopwatch* sw) @@ -46,12 +47,12 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, power of two: 4, 8, 16, 32, etc. OpenCL will make a best-guess optimal block size if you set size_t* tile_dim = NULL. */ - size_t tile_dim[2] = {TILE_W, - TILE_H}; + size_t tile_dim[2] = {bx, by}; size_t bloc_dim[2] = {ceil((float)(nx) / (tile_dim[0] - nm + 1)), ceil((float)(ny) / (tile_dim[1] - nm + 1))}; size_t grid_dim[2] = {bloc_dim[0] * tile_dim[0], bloc_dim[1] * tile_dim[1]}; + size_t buf_size = (tile_dim[0] + nm) * (tile_dim[1] + nm) * sizeof(fp_t); cl_mem d_conc_old = dev->conc_old; cl_mem d_conc_new = dev->conc_new; @@ -67,9 +68,10 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, status |= clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap)); status |= clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask)); - status |= clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx); - status |= clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny); - status |= clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm); + status |= clSetKernelArg(dev->convolution_kernel, 3, buf_size, NULL); + status |= clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&nx); + status |= clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&ny); + status |= clSetKernelArg(dev->convolution_kernel, 6, sizeof(int), (void *)&nm); report_error(status, "constant convolution kernel args"); status |= clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap)); @@ -94,25 +96,33 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new, } /* set time-dependent kernel arguments */ - status = clSetKernelArg(dev->boundary_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); + status = clSetKernelArg(dev->boundary_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); report_error(status, "mutable boundary kernel args"); status = clSetKernelArg(dev->convolution_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); report_error(status, "mutable convolution kernel args"); - status |= 
clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); - status |= clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new); + status |= clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old); + status |= clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new); report_error(status, "mutable diffusion kernel args"); - /* enqueue kernels */ + /* apply boundary conditions */ status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); + + /* compute Laplacian */ + start_time = GetTimer(); status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); + sw->conv += GetTimer() - start_time; + + /* compute result */ + start_time = GetTimer(); status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, tile_dim, 0, NULL, NULL); - report_error(status, "enqueue kernels"); + sw->step += GetTimer() - start_time; - *elapsed += dt; + report_error(status, "enqueue kernels"); } + *elapsed += dt * checks; /* transfer from device out to host */ start_time = GetTimer(); diff --git a/gpu-opencl-diffusion/opencl_kernels.h b/gpu-opencl-diffusion/opencl_kernels.h deleted file mode 100644 index 2afc6ca..0000000 --- a/gpu-opencl-diffusion/opencl_kernels.h +++ /dev/null @@ -1,44 +0,0 @@ -/********************************************************************************** - HiPerC: High Performance Computing Strategies for Boundary Value Problems - written by Trevor Keller and available from https://github.com/usnistgov/hiperc - - This software was developed at the National Institute of Standards and Technology - by employees of the Federal Government in the course of their official duties. - Pursuant to title 17 section 105 of the United States Code this software is not - subject to copyright protection and is in the public domain. NIST assumes no - responsibility whatsoever for the use of this software by other parties, and makes - no guarantees, expressed or implied, about its quality, reliability, or any other - characteristic. We would appreciate acknowledgement if the software is used. - - This software can be redistributed and/or modified freely provided that any - derivative works bear some notice that they are derived from it, and any modified - versions bear some notice that they have been modified. 
- - Questions/comments to Trevor Keller (trevor.keller@nist.gov) - **********************************************************************************/ - -/** - \file opencl_kernels.h - \brief Declaration of functions to execute on the GPU (OpenCL kernels) -*/ - -/** \cond SuppressGuard */ -#ifndef _OPENCL_KERNELS_H_ -#define _OPENCL_KERNELS_H_ -/** \endcond */ - -#include "numerics.h" - -/** - \brief Width of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_W 32 - -/** - \brief Height of an input tile, including halo cells, for GPU memory allocation -*/ -#define TILE_H 32 - -/** \cond SuppressGuard */ -#endif /* _OPENCL_KERNELS_H_ */ -/** \endcond */ diff --git a/gpu-opencl-diffusion/opencl_main.c b/gpu-opencl-diffusion/opencl_main.c index 245fa8f..7a95d7d 100644 --- a/gpu-opencl-diffusion/opencl_main.c +++ b/gpu-opencl-diffusion/opencl_main.c @@ -109,7 +109,7 @@ int main(int argc, char* argv[]) assert(step + checks <= steps); - opencl_diffusion_solver(&dev, conc_new, nx, ny, nm, D, dt, checks, &elapsed, &sw); + opencl_diffusion_solver(&dev, conc_new, bx, by, nx, ny, nm, D, dt, checks, &elapsed, &sw); for (i = 0; i < checks; i++) { step++; From 4c78f9f23e3f359851b3cd7b53d03bce7f5bf1aa Mon Sep 17 00:00:00 2001 From: Trevor Keller Date: Tue, 17 Oct 2017 18:15:42 -0400 Subject: [PATCH 4/4] Fix usage of OpenAcc data flow controls; 4x slower than tiled GPU algorithms (CUDA, OpenCL) Note: apply `#pragma acc declare present` in routines. Closes #49. --- gpu-openacc-diffusion/Makefile | 2 +- gpu-openacc-diffusion/openacc_boundaries.c | 59 ++++++++++--------- .../openacc_discretization.c | 24 +++----- 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/gpu-openacc-diffusion/Makefile b/gpu-openacc-diffusion/Makefile index 452a19e..02f3da8 100644 --- a/gpu-openacc-diffusion/Makefile +++ b/gpu-openacc-diffusion/Makefile @@ -2,7 +2,7 @@ # OpenACC implementation CXX = pgcc -CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -Minfo=accel -mp +CXXFLAGS = -O3 -I../common-diffusion -acc -ta=tesla -ta=tesla:cc30 -ta=tesla:cc50 -ta=tesla:cc60 -Minfo=accel -mp LINKS = -lm -lpng OBJS = boundaries.o discretization.o mesh.o numerics.o output.o timer.o diff --git a/gpu-openacc-diffusion/openacc_boundaries.c b/gpu-openacc-diffusion/openacc_boundaries.c index 6627782..d36a309 100644 --- a/gpu-openacc-diffusion/openacc_boundaries.c +++ b/gpu-openacc-diffusion/openacc_boundaries.c @@ -59,47 +59,48 @@ void apply_initial_conditions(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2] } } -void boundary_kernel(fp_t** conc, int nx, int ny, int nm, fp_t bc[2][2]) +void boundary_kernel(fp_t** __restrict__ conc, int nx, int ny, int nm, fp_t bc[2][2]) { /* apply fixed boundary values: sequence does not matter */ - #pragma acc parallel + #pragma acc declare present(conc[0:ny][0:nx], bc[0:2][0:2]) { - #pragma acc loop - for (int j = 0; j < ny/2; j++) { - #pragma acc loop - for (int i = 0; i < 1+nm/2; i++) { - conc[j][i] = bc[1][0]; /* left value */ + #pragma acc parallel + { + #pragma acc loop independent collapse(2) + for (int j = 0; j < ny/2; j++) { + for (int i = 0; i < 1+nm/2; i++) { + conc[j][i] = bc[1][0]; /* left value */ + } } - } - #pragma acc loop - for (int j = ny/2; j < ny; j++) { - #pragma acc loop - for (int i = nx-1-nm/2; i < nx; i++) { - conc[j][i] = bc[1][1]; /* right value */ + #pragma acc loop independent collapse(2) + for (int j = ny/2; j < ny; j++) { + for (int i = nx-1-nm/2; i < nx; i++) { + conc[j][i] = bc[1][1]; /* right value */ + } } } /* apply no-flux 
boundary conditions: inside to out, sequence matters */ for (int offset = 0; offset < nm/2; offset++) { - int ilo = nm/2 - offset; - int ihi = nx - 1 - nm/2 + offset; - #pragma acc loop - for (int j = 0; j < ny; j++) { - conc[j][ilo-1] = conc[j][ilo]; /* left condition */ - conc[j][ihi+1] = conc[j][ihi]; /* right condition */ - } - } - - for (int offset = 0; offset < nm/2; offset++) { - int jlo = nm/2 - offset; - int jhi = ny - 1 - nm/2 + offset; - #pragma acc loop - for (int i = 0; i < nx; i++) { - conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */ - conc[jhi+1][i] = conc[jhi][i]; /* top condition */ + #pragma acc parallel + { + int ilo = nm/2 - offset; + int ihi = nx - 1 - nm/2 + offset; + int jlo = nm/2 - offset; + int jhi = ny - 1 - nm/2 + offset; + #pragma acc loop independent + for (int j = 0; j < ny; j++) { + conc[j][ilo-1] = conc[j][ilo]; /* left condition */ + conc[j][ihi+1] = conc[j][ihi]; /* right condition */ + } + #pragma acc loop independent + for (int i = 0; i < nx; i++) { + conc[jlo-1][i] = conc[jlo][i]; /* bottom condition */ + conc[jhi+1][i] = conc[jhi][i]; /* top condition */ + } } } } diff --git a/gpu-openacc-diffusion/openacc_discretization.c b/gpu-openacc-diffusion/openacc_discretization.c index 677a20b..c984de0 100644 --- a/gpu-openacc-diffusion/openacc_discretization.c +++ b/gpu-openacc-diffusion/openacc_discretization.c @@ -31,13 +31,14 @@ void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int nx, int ny, int nm) { + #pragma acc declare present(conc_old[0:ny][0:nx], conc_lap[0:ny][0:nx], mask_lap[0:nm][0:nm]) #pragma acc parallel { - #pragma acc loop + #pragma acc loop collapse(2) for (int j = nm/2; j < ny-nm/2; j++) { - #pragma acc loop for (int i = nm/2; i < nx-nm/2; i++) { fp_t value = 0.; + #pragma acc loop seq collapse(2) for (int mj = -nm/2; mj < 1+nm/2; mj++) { for (int mi = -nm/2; mi < 1+nm/2; mi++) { value += mask_lap[mj+nm/2][mi+nm/2] * conc_old[j+mj][i+mi]; @@ -52,11 +53,11 @@ void convolution_kernel(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, int n void diffusion_kernel(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, int nx, int ny, int nm, fp_t D, fp_t dt) { + #pragma acc declare present(conc_old[0:ny][0:nx], conc_new[0:ny][0:nx], conc_lap[0:ny][0:nx]) #pragma acc parallel { - #pragma acc loop + #pragma acc loop collapse(2) for (int j = nm/2; j < ny-nm/2; j++) { - #pragma acc loop for (int i = nm/2; i < nx-nm/2; i++) { conc_new[j][i] = conc_old[j][i] + dt * D * conc_lap[j][i]; } @@ -64,23 +65,14 @@ void diffusion_kernel(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, } } - -void compute_convolution(fp_t** conc_old, fp_t** conc_lap, fp_t** mask_lap, - int nx, int ny, int nm) -{ - /* If you must compute the convolution separately, do so here. */ - #pragma acc data copyin(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm]) create(conc_lap[0:ny][0:nx]) copyout(conc_lap[0:ny][0:nx]) - { - convolution_kernel(conc_old, conc_lap, mask_lap, nx, ny, nm); - } -} - void solve_diffusion_equation(fp_t** conc_old, fp_t** conc_new, fp_t** conc_lap, fp_t** mask_lap, int nx, int ny, int nm, fp_t bc[2][2], fp_t D, fp_t dt, int checks, fp_t* elapsed, struct Stopwatch* sw) { - #pragma acc data copy(conc_old[0:ny][0:nx], mask_lap[0:nm][0:nm], bc[0:2][0:2]) create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx]) + #pragma acc data present_or_copy(conc_old[0:ny][0:nx]) \ + present_or_copyin(mask_lap[0:nm][0:nm], bc[0:2][0:2]) \ + present_or_create(conc_lap[0:ny][0:nx], conc_new[0:ny][0:nx]) { double start_time=0.; int check=0;