Skip to content

Commit

Permalink
fix OpenCL pointer swapping
Browse files Browse the repository at this point in the history
  • Loading branch information
tkphd committed Sep 20, 2017
1 parent 8fd5556 commit 7c5149f
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 32 deletions.
43 changes: 19 additions & 24 deletions gpu-opencl-diffusion/kernel_convolution.cl
Original file line number Diff line number Diff line change
Expand Up @@ -51,58 +51,53 @@ __kernel void convolution_kernel(__global fp_t* d_conc_old,
int ny,
int nm)
{
int bx, by, i, j, tx, ty;
int dst_row, dst_col, dst_tile_w, dst_tile_h;
int src_row, src_col, src_tile_w, src_tile_h;
int i, j, til_col, til_row;
int dst_col, dst_idx, dst_row, dst_cols, dst_rows;
int src_col, src_idx, src_row;
fp_t value=0.;

/* source tile width includes the halo cells */
src_tile_w = get_local_size(0);
src_tile_h = get_local_size(1);

/* destination tile width excludes the halo cells */
dst_tile_w = src_tile_w - nm + 1;
dst_tile_h = src_tile_h - nm + 1;

/* source block (working group) */
bx = get_group_id(0);
by = get_group_id(1);
/* source tile includes the halo cells, destination tile does not */
dst_cols = get_local_size(0) - nm + 1;
dst_rows = get_local_size(1) - nm + 1;

/* determine indices on which to operate */
tx = get_local_id(0);
ty = get_local_id(1);
til_col = get_local_id(0);
til_row = get_local_id(1);

dst_col = bx * dst_tile_w + tx;
dst_row = by * dst_tile_h + ty;
dst_col = get_group_id(0) * dst_cols + til_col; /* not get_global_id(0); */
dst_row = get_group_id(1) * dst_rows + til_row; /* not get_global_id(1); */

src_col = dst_col - nm/2;
src_row = dst_row - nm/2;

/* copy tile: __local gives access to all threads working on this tile */
dst_idx = dst_row * nx + dst_col;
src_idx = src_row * nx + src_col;

/* shared memory tile: __local gives access to all threads in the group */
__local fp_t d_conc_tile[TILE_H + MAX_MASK_H - 1][TILE_W + MAX_MASK_W - 1];

if ((src_row >= 0) && (src_row < ny) &&
(src_col >= 0) && (src_col < nx)) {
/* if src_row==0, then dst_row==nm/2: this is a halo row */
d_conc_tile[ty][tx] = d_conc_old[src_row * nx + src_col];
d_conc_tile[til_row][til_col] = d_conc_old[src_idx];
} else {
/* points outside the halo should be switched off */
d_conc_tile[ty][tx] = 0.;
d_conc_tile[til_row][til_col] = 0.;
}

/* tile data is shared: wait for all threads to finish copying */
barrier(CLK_LOCAL_MEM_FENCE);

/* compute the convolution */
if (tx < dst_tile_w && ty < dst_tile_h) {
if (til_col < dst_cols && til_row < dst_rows) {
for (j = 0; j < nm; j++) {
for (i = 0; i < nm; i++) {
value += d_mask[j * nm + i] * d_conc_tile[j+ty][i+tx];
value += d_mask[j * nm + i] * d_conc_tile[j+til_row][i+til_col];
}
}
/* record value */
if (dst_row < ny && dst_col < nx) {
d_conc_lap[dst_row * nx + dst_col] = value;
d_conc_lap[dst_idx] = value;
}
}

Expand Down
19 changes: 11 additions & 8 deletions gpu-opencl-diffusion/opencl_discretization.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new,
power of two: 4, 8, 16, 32, etc. OpenCL will make a best-guess optimal
block size if you set size_t* block_dim = NULL.
*/
size_t bx = TILE_W;
size_t by = TILE_H;
size_t grid_dim[2] = {(size_t)nx, (size_t)ny};
size_t block_dim[2] = {bx, by};
size_t block_dim[2] = {TILE_W, TILE_H};

cl_mem d_conc_old = dev->conc_old;
cl_mem d_conc_new = dev->conc_new;
Expand Down Expand Up @@ -94,6 +92,15 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new,
* We leave the pointers alone but call the kernel on the appropriate data location.
*/
for (check = 0; check < checks; check++) {
/* swap pointers on the device */
if (check % 2 == 0) {
d_conc_old = dev->conc_old;
d_conc_new = dev->conc_new;
} else {
d_conc_old = dev->conc_new;
d_conc_new = dev->conc_old;
}

/* set time-dependent kernel arguments */
status = clSetKernelArg(dev->boundary_kernel, 0, sizeof(cl_mem), (void *)&d_conc_old);
report_error(status, "mutable boundary args[0]");
Expand All @@ -116,17 +123,13 @@ void opencl_diffusion_solver(struct OpenCLData* dev, fp_t** conc_new,

status = clEnqueueNDRangeKernel(dev->commandQueue, dev->diffusion_kernel, 2, NULL, grid_dim, block_dim, 0, NULL, NULL);
report_error(status, "enqueue diffusion kernel");

/* swap pointers on the device */
d_conc_old = dev->conc_new;
d_conc_new = dev->conc_old;
}

*elapsed += dt * checks;

/* transfer from device out to host */
start_time = GetTimer();
status = clEnqueueReadBuffer(dev->commandQueue, d_conc_old, CL_TRUE, 0, grid_size, conc_new[0], 0, NULL, NULL);
status = clEnqueueReadBuffer(dev->commandQueue, d_conc_new, CL_TRUE, 0, grid_size, conc_new[0], 0, NULL, NULL);
report_error(status, "retrieve result from GPU");
sw->file += GetTimer() - start_time;
}
Expand Down

0 comments on commit 7c5149f

Please sign in to comment.