Skip to content

Commit

Permalink
streamlining
Browse files Browse the repository at this point in the history
  • Loading branch information
tkphd committed Sep 26, 2017
1 parent 7c5149f commit 4507500
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 102 deletions.
6 changes: 4 additions & 2 deletions common-diffusion/params.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
nx 512 # mesh points along x-axis
ny 512 # mesh points along y-axis
nx 512 # total mesh points along x-axis
ny 512 # total mesh points along y-axis
dx 0.5 # mesh resolution along x-axis
dy 0.5 # mesh resolution along y-axis
bx 32 # convolution block size along x-axis
by 32 # convolution block size along y-axis
ns 100000 # number of timesteps to march
nc 10000 # number of timesteps between checkpoint outputs
dc 0.00625 # diffusion coefficient
Expand Down
28 changes: 14 additions & 14 deletions gpu-cuda-diffusion/cuda_boundaries.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ __global__ void boundary_kernel(fp_t* d_conc,

/* apply fixed boundary values: sequence does not matter */

if (row >= 0 && row < ny/2 && col >= 0 && col < 1+nm/2) {
if (row < ny/2 && col < 1+nm/2) {
d_conc[row * nx + col] = d_bc[1][0]; /* left value */
}

Expand All @@ -99,22 +99,22 @@ __global__ void boundary_kernel(fp_t* d_conc,
for (offset = 0; offset < nm/2; offset++) {
ilo = nm/2 - offset;
ihi = nx - 1 - nm/2 + offset;
if (col == ilo-1 && row >= 0 && row < ny) {
d_conc[row * nx + col] = d_conc[row * nx + ilo]; /* left condition */
} else if (col == ihi+1 && row >= 0 && row < ny) {
d_conc[row * nx + col] = d_conc[row * nx + ihi]; /* right condition */
}
__syncthreads();
}

for (offset = 0; offset < nm/2; offset++) {
jlo = nm/2 - offset;
jhi = ny - 1 - nm/2 + offset;
if (row == jlo-1 && col >= 0 && col < nx) {
d_conc[row * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */
} else if (row == jhi+1 && col >= 0 && col < nx) {
d_conc[row * nx + col] = d_conc[jhi * nx + col]; /* top condition */

if (ilo-1 == col && row < ny) {
d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */
}
if (ihi+1 == col && row < ny) {
d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */
}
if (jlo-1 == row && col < nx) {
d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */
}
if (jhi+1 == row && col < nx) {
d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */
}

__syncthreads();
}
}
5 changes: 3 additions & 2 deletions gpu-opencl-diffusion/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
# OpenCL implementation

CC = gcc
CFLAGS = -g -Wall -pedantic -std=c11 -I../common-diffusion -fopenmp
CFLAGS = -O3 -Wall -pedantic -std=c11 -I../common-diffusion -fopenmp
LINKS = -lm -lpng -lOpenCL

LLVM = clang
LFLAGS = -S -emit-llvm -x cl -I../common-diffusion

KERNELS = kernel_boundary.cl kernel_convolution.cl kernel_diffusion.cl
OBJS = boundaries.o data.o discretization.o mesh.o numerics.o output.o timer.o

# Executable
diffusion: opencl_main.c $(OBJS)
diffusion: opencl_main.c $(KERNELS) $(OBJS)
$(CC) $(CFLAGS) $(OBJS) $< -o $@ $(LINKS)

# OpenCL objects
Expand Down
35 changes: 17 additions & 18 deletions gpu-opencl-diffusion/kernel_boundary.cl
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc,
int ny,
int nm)
{
int idx, col, row;
int col, row;
int ihi, ilo, jhi, jlo, offset;

/* determine indices on which to operate */

col = get_global_id(0);
row = get_global_id(1);
idx = row * nx + col;

/* apply fixed boundary values: sequence does not matter */

if (row >= 0 && row < ny/2 && col >= 0 && col < 1+nm/2) {
d_conc[idx] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */
if (row < ny/2 && col < 1+nm/2) {
d_conc[row * nx + col] = d_bc[2]; /* left value, bc[1][0] = bc[2*1 + 0] */
}

if (row >= ny/2 && row < ny && col >= nx-1-nm/2 && col < nx) {
d_conc[idx] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */
d_conc[row * nx + col] = d_bc[3]; /* right value, bc[1][1] = bc[2*1 + 1] */
}

/* wait for all threads to finish writing */
Expand All @@ -64,22 +63,22 @@ __kernel void boundary_kernel(__global fp_t* d_conc,
for (offset = 0; offset < nm/2; offset++) {
ilo = nm/2 - offset;
ihi = nx - 1 - nm/2 + offset;
if (col == ilo-1 && row >= 0 && row < ny) {
d_conc[idx] = d_conc[row * nx + ilo]; /* left condition */
} else if (col == ihi+1 && row >= 0 && row < ny) {
d_conc[idx] = d_conc[row * nx + ihi]; /* right condition */
}
barrier(CLK_GLOBAL_MEM_FENCE);
}

for (offset = 0; offset < nm/2; offset++) {
jlo = nm/2 - offset;
jhi = ny - 1 - nm/2 + offset;
if (row == jlo-1 && col >= 0 && col < nx) {
d_conc[idx] = d_conc[jlo * nx + col]; /* bottom condition */
} else if (row == jhi+1 && col >= 0 && col < nx) {
d_conc[idx] = d_conc[jhi * nx + col]; /* top condition */

if (ilo-1 == col && row < ny) {
d_conc[row * nx + ilo-1] = d_conc[row * nx + ilo]; /* left condition */
}
if (ihi+1 == col && row < ny) {
d_conc[row * nx + ihi+1] = d_conc[row * nx + ihi]; /* right condition */
}
if (jlo-1 == row && col < nx) {
d_conc[(jlo-1) * nx + col] = d_conc[jlo * nx + col]; /* bottom condition */
}
if (jhi+1 == row && col < nx) {
d_conc[(jhi+1) * nx + col] = d_conc[jhi * nx + col]; /* top condition */
}

barrier(CLK_GLOBAL_MEM_FENCE);
}
}
35 changes: 16 additions & 19 deletions gpu-opencl-diffusion/kernel_convolution.cl
Original file line number Diff line number Diff line change
Expand Up @@ -51,38 +51,35 @@ __kernel void convolution_kernel(__global fp_t* d_conc_old,
int ny,
int nm)
{
int i, j, til_col, til_row;
int dst_col, dst_idx, dst_row, dst_cols, dst_rows;
int src_col, src_idx, src_row;
fp_t value=0.;
int i, j;
int dst_col, dst_cols, dst_row, dst_rows;
int src_col, src_cols, src_row, src_rows;
int til_col, til_row;
fp_t value = 0.;

/* source tile includes the halo cells, destination tile does not */
dst_cols = get_local_size(0) - nm + 1;
dst_rows = get_local_size(1) - nm + 1;
src_cols = get_local_size(0);
src_rows = get_local_size(1);

dst_cols = src_cols - nm + 1;
dst_rows = src_rows - nm + 1;

/* determine indices on which to operate */
til_col = get_local_id(0);
til_row = get_local_id(1);

dst_col = get_group_id(0) * dst_cols + til_col; /* not get_global_id(0); */
dst_row = get_group_id(1) * dst_rows + til_row; /* not get_global_id(1); */
dst_col = get_group_id(0) * dst_cols + til_col;
dst_row = get_group_id(1) * dst_rows + til_row;

src_col = dst_col - nm/2;
src_row = dst_row - nm/2;

dst_idx = dst_row * nx + dst_col;
src_idx = src_row * nx + src_col;

/* shared memory tile: __local gives access to all threads in the group */
__local fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1];

if ((src_row >= 0) && (src_row < ny) &&
(src_col >= 0) && (src_col < nx)) {
/* if src_row==0, then dst_row==nm/2: this is a halo row */
d_conc_tile[til_row][til_col] = d_conc_old[src_idx];
} else {
/* points outside the halo should be switched off */
d_conc_tile[til_row][til_col] = 0.;
if (src_row >= 0 && src_row < ny &&
src_col >= 0 && src_col < nx) {
d_conc_tile[til_row][til_col] = d_conc_old[src_row * nx + src_col];
}

/* tile data is shared: wait for all threads to finish copying */
Expand All @@ -97,7 +94,7 @@ __kernel void convolution_kernel(__global fp_t* d_conc_old,
}
/* record value */
if (dst_row < ny && dst_col < nx) {
d_conc_lap[dst_idx] = value;
d_conc_lap[dst_row * nx + dst_col] = value;
}
}

Expand Down
6 changes: 6 additions & 0 deletions gpu-opencl-diffusion/opencl_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,12 @@ void build_program(const char* filename,
void free_opencl(struct OpenCLData* dev)
{
/* clean up */
free(dev->conc_old);
free(dev->conc_new);
free(dev->conc_lap);
free(dev->bc);
free(dev->mask);

clReleaseContext(dev->context);

clReleaseKernel(dev->boundary_kernel);
Expand Down
76 changes: 29 additions & 47 deletions gpu-opencl-diffusion/opencl_discretization.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,38 +55,26 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new,
cl_int status = CL_SUCCESS;

/* set immutable kernel arguments */
status = clSetKernelArg(dev->boundary_kernel, 1, sizeof(cl_mem), (void *)&(dev->bc));
report_error(status, "const boundary args[1]");
status = clSetKernelArg(dev->boundary_kernel, 2, sizeof(int), (void *)&nx);
report_error(status, "const boundary args[2]");
status = clSetKernelArg(dev->boundary_kernel, 3, sizeof(int), (void *)&ny);
report_error(status, "const boundary args[3]");
status = clSetKernelArg(dev->boundary_kernel, 4, sizeof(int), (void *)&nm);
report_error(status, "const boundary args[4]");

status = clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap));
report_error(status, "const convolution args[1]");
status = clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask));
report_error(status, "const convolution args[2]");
status = clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx);
report_error(status, "const convolution args[3]");
status = clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny);
report_error(status, "const convolution args[4]");
status = clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm);
report_error(status, "const convolution args[5]");

status = clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap));
report_error(status, "const diffusion args[2]");
status = clSetKernelArg(dev->diffusion_kernel, 3, sizeof(int), (void *)&nx);
report_error(status, "const diffusion args[3]");
status = clSetKernelArg(dev->diffusion_kernel, 4, sizeof(int), (void *)&ny);
report_error(status, "const diffusion args[4]");
status = clSetKernelArg(dev->diffusion_kernel, 5, sizeof(int), (void *)&nm);
report_error(status, "const diffusion args[5]");
status = clSetKernelArg(dev->diffusion_kernel, 6, sizeof(fp_t), (void *)&D);
report_error(status, "const diffusion args[6]");
status = clSetKernelArg(dev->diffusion_kernel, 7, sizeof(fp_t), (void *)&dt);
report_error(status, "const diffusion args[7]");
status |= clSetKernelArg(dev->boundary_kernel, 1, sizeof(cl_mem), (void *)&(dev->bc));
status |= clSetKernelArg(dev->boundary_kernel, 2, sizeof(int), (void *)&nx);
status |= clSetKernelArg(dev->boundary_kernel, 3, sizeof(int), (void *)&ny);
status |= clSetKernelArg(dev->boundary_kernel, 4, sizeof(int), (void *)&nm);
report_error(status, "constant boundary kernal args");

status |= clSetKernelArg(dev->convolution_kernel, 1, sizeof(cl_mem), (void *)&(dev->conc_lap));
status |= clSetKernelArg(dev->convolution_kernel, 2, sizeof(cl_mem), (void *)&(dev->mask));
status |= clSetKernelArg(dev->convolution_kernel, 3, sizeof(int), (void *)&nx);
status |= clSetKernelArg(dev->convolution_kernel, 4, sizeof(int), (void *)&ny);
status |= clSetKernelArg(dev->convolution_kernel, 5, sizeof(int), (void *)&nm);
report_error(status, "constant convolution kernel args");

status |= clSetKernelArg(dev->diffusion_kernel, 2, sizeof(cl_mem), (void *)&(dev->conc_lap));
status |= clSetKernelArg(dev->diffusion_kernel, 3, sizeof(int), (void *)&nx);
status |= clSetKernelArg(dev->diffusion_kernel, 4, sizeof(int), (void *)&ny);
status |= clSetKernelArg(dev->diffusion_kernel, 5, sizeof(int), (void *)&nm);
status |= clSetKernelArg(dev->diffusion_kernel, 6, sizeof(fp_t), (void *)&D);
status |= clSetKernelArg(dev->diffusion_kernel, 7, sizeof(fp_t), (void *)&dt);
report_error(status, "constant diffusion kernel args");

/* OpenCL uses cl_mem, not fp_t*, so swap_pointers won't work.
* We leave the pointers alone but call the kernel on the appropriate data location.
Expand All @@ -103,26 +91,20 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new,

/* set time-dependent kernel arguments */
status = clSetKernelArg(dev->boundary_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old);
report_error(status, "mutable boundary args[0]");
report_error(status, "mutable boundary kernel args");

status = clSetKernelArg(dev->convolution_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old);
report_error(status, "mutable convolution args[0]");
report_error(status, "mutable convolution kernel args");

status = clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old);
report_error(status, "mutable diffusion args[0]");

status = clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new);
report_error(status, "mutable diffusion args[1]");
status |= clSetKernelArg(dev->diffusion_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old);
status |= clSetKernelArg(dev->diffusion_kernel, 1, sizeof(cl_mem), (void *)&d_conc_new);
report_error(status, "mutable diffusion kernel args");

/* enqueue kernels */
status = clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
report_error(status, "enqueue boundary kernel");

status = clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
report_error(status, "enqueue convolution kernel");

status = clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
report_error(status, "enqueue diffusion kernel");
status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->boundary_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->convolution_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
status |= clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
report_error(status, "enqueue kernels");
}

*elapsed += dt * checks;
Expand Down

0 comments on commit 4507500

Please sign in to comment.