Hw3 #437
18 changes: 16 additions & 2 deletions HW3/P2/mandelbrot.cl
@@ -9,11 +9,25 @@ mandelbrot(__global __read_only float *coords_real,
const int y = get_global_id(1);

float c_real, c_imag;
float z_real, z_imag, z_real_temp;
int iter;

if ((x < w) && (y < h)) {
c_real = coords_real[x + y * w];
c_imag = coords_imag[x + y * w];
z_real = 0;
z_imag = 0;

        // iterate z <- z^2 + c until divergence or max_iter
        for (iter = 0; iter < max_iter; iter++) {
            // |z| > 2 (i.e., |z|^2 > 4) means the point escapes
            if ((z_real * z_real + z_imag * z_imag) > 4)
                break;
            z_real_temp = z_real;  // save z_real before overwriting it
            z_real = c_real + (z_real * z_real) - (z_imag * z_imag);
            z_imag = 2 * z_real_temp * z_imag + c_imag;
        }

out_counts[x + y * w] = iter;
}
}
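
For reference (not part of this PR), a minimal PyOpenCL host-side sketch for driving this kernel. The argument order (coords_real, coords_imag, out_counts, w, h, max_iter) is an assumption read off the kernel body above; the actual harness in HW3/P2 may differ.

    import numpy as np
    import pyopencl as cl

    # hypothetical driver; grid extents and resolution are arbitrary
    w, h, max_iter = 1024, 768, 511
    c_re, c_im = np.meshgrid(np.linspace(-2.0, 1.0, w, dtype=np.float32),
                             np.linspace(-1.5, 1.5, h, dtype=np.float32))

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    d_real = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=c_re.ravel())
    d_imag = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=c_im.ravel())
    counts = np.zeros(w * h, dtype=np.int32)
    d_counts = cl.Buffer(ctx, mf.WRITE_ONLY, counts.nbytes)

    prg = cl.Program(ctx, open('mandelbrot.cl').read()).build()
    prg.mandelbrot(queue, (w, h), None, d_real, d_imag, d_counts,
                   np.int32(w), np.int32(h), np.int32(max_iter))
    cl.enqueue_copy(queue, counts, d_counts)
    counts = counts.reshape(h, w)  # iteration count per pixel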
20 changes: 20 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,20 @@
I used an Ubuntu virtual machine.
The platforms detected are:
---------------------------
AMD Accelerated Parallel Processing Advanced Micro Devices, Inc. version: OpenCL 2.0 AMD-APP (1800.8)
The devices detected on platform AMD Accelerated Parallel Processing are:
---------------------------
Intel(R) Core(TM) i7-3689Y CPU @ 1.50GHz [Type: CPU ]
Maximum clock Frequency: 1496 MHz
Maximum allocable memory size: 1073 MB
Maximum work group size 1024
Maximum work item dimensions 3
Maximum work item size [1024, 1024, 1024]
---------------------------
This context is associated with 1 devices
The queue is using the device: Intel(R) Core(TM) i7-3689Y CPU @ 1.50GHz
The device memory bandwidth is 1.1424849539 GB/s
The host-device bandwidth is 5.03213458243 GB/s

Best:
configuration ('blocked', 8, 32): 0.013661182 seconds
33 changes: 23 additions & 10 deletions HW3/P3/sum.cl
@@ -8,8 +8,10 @@ __kernel void sum_coalesced(__global float* x,

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
    // Coalesced access: adjacent threads read adjacent elements,
    // each striding by the total number of work-items.
    for (uint i = get_global_id(0); i < N; i += get_global_size(0)) {
        sum = sum + x[i];
    }

fast[local_id] = sum;
@@ -24,8 +26,13 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Binary-tree reduction in local memory: halve the active
    // threads each pass until fast[0] holds the workgroup's sum.
    for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,8 +45,7 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
    int k = ceil((float)N / get_global_size(0));
// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
// thread with global_id 2 should add 2k..3k-1
@@ -48,8 +54,10 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
    // Blocked access: thread g sums its own contiguous chunk
    // x[g*k .. (g+1)*k - 1].
    for (uint i = get_global_id(0) * k; i < (get_global_id(0) + 1) * k; i++) {
        if (i < N) {  // the last chunk may run past the end of x
            sum = sum + x[i];
        }
    }

fast[local_id] = sum;
Expand All @@ -64,8 +72,13 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Same binary-tree reduction in local memory as in sum_coalesced.
    for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

if (local_id == 0) partial[get_group_id(0)] = fast[0];
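
Both kernels leave one partial sum per workgroup in partial; someone still has to add those up. A minimal PyOpenCL sketch of that final step (not part of the PR); the (x, partial, fast, N) argument order and N's exact type are assumptions based on the visible signatures, so check them against sum.cl.

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, open('sum.cl').read()).build()

    x = np.random.rand(10000000).astype(np.float32)
    num_workgroups, workgroup_size = 128, 64
    mf = cl.mem_flags
    d_x = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    d_partial = cl.Buffer(ctx, mf.READ_WRITE, 4 * num_workgroups)  # one float32 per workgroup
    local_mem = cl.LocalMemory(4 * workgroup_size)

    prg.sum_coalesced(queue, (num_workgroups * workgroup_size,), (workgroup_size,),
                      d_x, d_partial, local_mem, np.uint64(x.size))
    partials = np.empty(num_workgroups, dtype=np.float32)
    cl.enqueue_copy(queue, partials, d_partial)
    total = partials.sum()  # the host finishes the reduction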
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
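
The size fix above is right: each workgroup writes exactly one float32, i.e. 4 bytes. A small hypothetical helper (make_partial_buffer is not in the PR) that derives the size from the dtype would make that intent explicit:

    import numpy as np
    import pyopencl as cl

    def make_partial_buffer(ctx, num_workgroups, dtype=np.float32):
        # one partial sum per workgroup; byte size derived from the dtype
        return cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                         np.dtype(dtype).itemsize * num_workgroups)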
65 changes: 58 additions & 7 deletions HW3/P4/median_filter.cl
@@ -1,5 +1,20 @@
#include "median9.h"

static float
get_clamped_value(__global __read_only float *in_values,
                  int w, int h,
                  int x, int y)
{
    // clamp out-of-bounds coordinates to the nearest valid pixel
    if (x < 0)  { x = 0; }      // past the left edge
    if (x >= w) { x = w - 1; }  // past the right edge
    if (y < 0)  { y = 0; }      // past the top edge
    if (y >= h) { y = h - 1; }  // past the bottom edge
    return in_values[y * w + x];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -9,19 +24,42 @@ median_3x3(__global __read_only float *in_values,
int buf_w, int buf_h,
const int halo)
{

// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

const int x = get_global_id(0);
const int y = get_global_id(1);

// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

    // Load the relevant image values into the local buffer, halo
    // included: the first buf_w threads each fill one column.
    if (idx_1D < buf_w) {
        for (int row = 0; row < buf_h; row++) {
            buffer[row * buf_w + idx_1D] =
                get_clamped_value(in_values,
                                  w, h,
                                  buf_corner_x + idx_1D, buf_corner_y + row);
        }
    }

// Make sure all threads reach the next part after
// the local buffer is loaded
barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
@@ -31,4 +69,17 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.


// write output
if ((y < h) && (x < w)) // stay in bounds
out_values[y * w + x] = median9(buffer[(buf_y-1)*buf_w + buf_x-1],
buffer[(buf_y-1)*buf_w + buf_x],
buffer[(buf_y-1)*buf_w + buf_x+1],
buffer[(buf_y)*buf_w + buf_x-1],
buffer[(buf_y)*buf_w + buf_x],
buffer[(buf_y)*buf_w + buf_x+1],
buffer[(buf_y+1)*buf_w + buf_x-1],
buffer[(buf_y+1)*buf_w + buf_x],
buffer[(buf_y+1)*buf_w + buf_x+1]);
}
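
A NumPy reference implementation (not part of the PR) is useful for checking the kernel's output; np.pad with mode='edge' reproduces the same clamp-to-edge boundary handling as get_clamped_value. median_3x3_ref is a hypothetical name for this sketch.

    import numpy as np

    def median_3x3_ref(image):
        # replicate edge pixels, matching get_clamped_value's clamping
        padded = np.pad(image, 1, mode='edge')
        h, w = image.shape
        # stack the nine shifted views of each pixel's 3x3 neighborhood
        neighborhoods = np.stack([padded[dy:dy + h, dx:dx + w]
                                  for dy in range(3) for dx in range(3)])
        return np.median(neighborhoods, axis=0).astype(image.dtype)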
7 changes: 5 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,10 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'

def round_up(global_size, group_size):
r = global_size % group_size
@@ -51,7 +53,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
44 changes: 44 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,44 @@
I failed to run P5 on my Ubuntu virtual machine (the other problems work fine), so I ran my code on someone else's MacBook.

Part 1:
maze1:
Finished after 876 iterations, 612.37928 ms total, 0.699063105023 ms per iteration
Found 2 regions
maze2:
Finished after 506 iterations, 355.05848 ms total, 0.701696600791 ms per iteration
Found 35 regions

Part 2:
maze1:
Finished after 528 iterations, 375.64456 ms total, 0.711448030303 ms per iteration
Found 2 regions
maze2:
Finished after 272 iterations, 193.80496 ms total, 0.712518235294 ms per iteration
Found 35 regions

Part 3:
maze1:
Finished after 10 iterations, 8.67944 ms total, 0.867944 ms per iteration
Found 2 regions
maze2:
Finished after 9 iterations, 7.70608 ms total, 0.856231111111 ms per iteration
Found 35 regions

Part 4:
maze1:
Finished after 10 iterations, 29.2644 ms total, 2.92644 ms per iteration
Found 2 regions
maze2:
Finished after 9 iterations, 26.15008 ms total, 2.90556444444 ms per iteration

According to my results, using a single thread makes the program slower. At first I expected performance to improve,
because Part 4 avoids some of the redundant global memory reads. However, I got a different result. I think this is
because global memory reads on the GPU are relatively cheap, and since we now use a single thread per workgroup to
look up the values, the memory accesses have to be done serially.
The result might be different if we had more repeated labels.

Part 5:
The atomic operation works like a lock: if we used plain min instead of atomic_min, the label values could be read and
written by multiple threads simultaneously. In this case the result might still be correct even with plain min, because
threads in the same region write the same label. But the disadvantage of plain min is that performance might drop due
to redundant updates caused by race conditions.
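
To make the atomic semantics concrete, here is a small self-contained PyOpenCL sketch (not part of the submission): thousands of work-items race to fold their value into one global cell, and atomic_min guarantees no update is lost, so the assertion holds on every run.

    import numpy as np
    import pyopencl as cl

    KERNEL = """
    __kernel void global_min(__global const int *vals, __global int *result) {
        // atomic read-modify-write: safe even though every work-item
        // targets the same address
        atomic_min(result, vals[get_global_id(0)]);
    }
    """

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, KERNEL).build()

    vals = np.random.randint(0, 1000000, size=4096).astype(np.int32)
    mf = cl.mem_flags
    d_vals = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vals)
    result = np.array([np.iinfo(np.int32).max], dtype=np.int32)
    d_result = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=result)

    prg.global_min(queue, (vals.size,), None, d_vals, d_result)
    cl.enqueue_copy(queue, result, d_result)
    assert result[0] == vals.min()  # atomicity makes this hold every run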

47 changes: 45 additions & 2 deletions HW3/P5/label_regions.cl
@@ -80,20 +80,63 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
// Part 2
//if (old_label < w*h){
// buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
//}

    // Part 4: a single thread per workgroup fetches the grandparent
    // labels, caching the last lookup to avoid redundant global reads.
    if (lx == 0 && ly == 0) {
        int last_label = -1;  // label most recently looked up in global memory
        int last_value = -1;  // its fetched value, labels[last_label]

        // walk the whole local buffer, halo included
        for (int row = 0; row < buf_h; row++) {
            for (int col = 0; col < buf_w; col++) {
                int cur_idx = row * buf_w + col;
                int cur_label = buffer[cur_idx];

                if (cur_label < w * h) {            // skip background pixels
                    if (cur_label == last_label) {  // cache hit: reuse the value
                        buffer[cur_idx] = last_value;
                    } else {                        // cache miss: read from global
                        last_label = cur_label;
                        last_value = labels[cur_label];
                        buffer[cur_idx] = last_value;
                    }
                }
            }
        }
    }

    // make the single thread's buffer updates visible to the whole workgroup
barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
        // CODE FOR PART 1 HERE
        // re-read our pixel: parts 2/4 may have updated the buffer
        new_label = buffer[buf_y * buf_w + buf_x];
        if (new_label < w * h) {
            // take the minimum of the pixel and its 4 neighboring pixels
            new_label = min(new_label, buffer[buf_y * buf_w + buf_x - 1]);
            new_label = min(new_label, buffer[buf_y * buf_w + buf_x + 1]);
            new_label = min(new_label, buffer[(buf_y - 1) * buf_w + buf_x]);
            new_label = min(new_label, buffer[(buf_y + 1) * buf_w + buf_x]);
        }

if (new_label != old_label) {
// CODE FOR PART 3 HERE
atomic_min(&labels[old_label], new_label);
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
atomic_min(&labels[y * w + x], new_label);
}
}

}