Hw3 #437
18 changes: 16 additions & 2 deletions HW3/P2/mandelbrot.cl
@@ -9,11 +9,25 @@ mandelbrot(__global __read_only float *coords_real,
const int y = get_global_id(1);

float c_real, c_imag;
float z_real, z_imag, z_real_temp;
int iter;

if ((x < w) && (y < h)) {
c_real = coords_real[x + y * w];
c_imag = coords_imag[x + y * w];
z_real = 0;
z_imag = 0;

        // iterate z <- z^2 + c until divergence or max_iter
        for (iter = 0; iter < max_iter; iter++) {
            // |z| > 2 (i.e., |z|^2 > 4) means the point escapes
            if ((z_real * z_real + z_imag * z_imag) > 4)
                break;
            z_real_temp = z_real;  // save z_real before overwriting it
            z_real = c_real + (z_real * z_real) - (z_imag * z_imag);
            z_imag = 2 * z_real_temp * z_imag + c_imag;
        }

out_counts[x + y * w] = iter;
}
}
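
For reference (not part of this PR), a minimal PyOpenCL host-side sketch for driving this kernel. The argument order (coords_real, coords_imag, out_counts, w, h, max_iter) is an assumption read off the kernel body above; the actual harness in HW3/P2 may differ.

    import numpy as np
    import pyopencl as cl

    # hypothetical driver; grid extents and resolution are arbitrary
    w, h, max_iter = 1024, 768, 511
    c_re, c_im = np.meshgrid(np.linspace(-2.0, 1.0, w, dtype=np.float32),
                             np.linspace(-1.5, 1.5, h, dtype=np.float32))

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    d_real = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=c_re.ravel())
    d_imag = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=c_im.ravel())
    counts = np.zeros(w * h, dtype=np.int32)
    d_counts = cl.Buffer(ctx, mf.WRITE_ONLY, counts.nbytes)

    prg = cl.Program(ctx, open('mandelbrot.cl').read()).build()
    prg.mandelbrot(queue, (w, h), None, d_real, d_imag, d_counts,
                   np.int32(w), np.int32(h), np.int32(max_iter))
    cl.enqueue_copy(queue, counts, d_counts)
    counts = counts.reshape(h, w)  # iteration count per pixel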
20 changes: 20 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,20 @@
I used an Ubuntu virtual machine.
The platforms detected are:
---------------------------
AMD Accelerated Parallel Processing Advanced Micro Devices, Inc. version: OpenCL 2.0 AMD-APP (1800.8)
The devices detected on platform AMD Accelerated Parallel Processing are:
---------------------------
Intel(R) Core(TM) i7-3689Y CPU @ 1.50GHz [Type: CPU ]
Maximum clock Frequency: 1496 MHz
Maximum allocable memory size: 1073 MB
Maximum work group size 1024
Maximum work item dimensions 3
Maximum work item size [1024, 1024, 1024]
---------------------------
This context is associated with 1 devices
The queue is using the device: Intel(R) Core(TM) i7-3689Y CPU @ 1.50GHz
The device memory bandwidth is 1.1424849539 GB/s
The host-device bandwidth is 5.03213458243 GB/s

Best:
configuration ('blocked', 8, 32): 0.013661182 seconds
33 changes: 23 additions & 10 deletions HW3/P3/sum.cl
@@ -8,8 +8,10 @@ __kernel void sum_coalesced(__global float* x,

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
    // Coalesced access: adjacent threads read adjacent elements,
    // each striding by the total number of work-items.
    for (uint i = get_global_id(0); i < N; i += get_global_size(0)) {
        sum = sum + x[i];
    }

fast[local_id] = sum;
@@ -24,8 +26,13 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Binary-tree reduction in local memory: halve the active
    // threads each pass until fast[0] holds the workgroup's sum.
    for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,8 +45,7 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
    int k = ceil((float)N / get_global_size(0));
// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
// thread with global_id 2 should add 2k..3k-1
@@ -48,8 +54,10 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
    // Blocked access: thread g sums its own contiguous chunk
    // x[g*k .. (g+1)*k - 1].
    for (uint i = get_global_id(0) * k; i < (get_global_id(0) + 1) * k; i++) {
        if (i < N) {  // the last chunk may run past the end of x
            sum = sum + x[i];
        }
    }

fast[local_id] = sum;
Expand All @@ -64,8 +72,13 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
    // Same binary-tree reduction in local memory as in sum_coalesced.
    for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
        if (local_id < s) {
            fast[local_id] += fast[local_id + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

if (local_id == 0) partial[get_group_id(0)] = fast[0];
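
Both kernels leave one partial sum per workgroup in partial; someone still has to add those up. A minimal PyOpenCL sketch of that final step (not part of the PR); the (x, partial, fast, N) argument order and N's exact type are assumptions based on the visible signatures, so check them against sum.cl.

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, open('sum.cl').read()).build()

    x = np.random.rand(10000000).astype(np.float32)
    num_workgroups, workgroup_size = 128, 64
    mf = cl.mem_flags
    d_x = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    d_partial = cl.Buffer(ctx, mf.READ_WRITE, 4 * num_workgroups)  # one float32 per workgroup
    local_mem = cl.LocalMemory(4 * workgroup_size)

    prg.sum_coalesced(queue, (num_workgroups * workgroup_size,), (workgroup_size,),
                      d_x, d_partial, local_mem, np.uint64(x.size))
    partials = np.empty(num_workgroups, dtype=np.float32)
    cl.enqueue_copy(queue, partials, d_partial)
    total = partials.sum()  # the host finishes the reduction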
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
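
The size fix above is right: each workgroup writes exactly one float32, i.e. 4 bytes. A small hypothetical helper (make_partial_buffer is not in the PR) that derives the size from the dtype would make that intent explicit:

    import numpy as np
    import pyopencl as cl

    def make_partial_buffer(ctx, num_workgroups, dtype=np.float32):
        # one partial sum per workgroup; byte size derived from the dtype
        return cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                         np.dtype(dtype).itemsize * num_workgroups)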
65 changes: 58 additions & 7 deletions HW3/P4/median_filter.cl
@@ -1,5 +1,20 @@
#include "median9.h"

static float
get_clamped_value(__global __read_only float *in_values,
                  int w, int h,
                  int x, int y)
{
    // clamp out-of-bounds coordinates to the nearest valid pixel
    if (x < 0)  { x = 0; }      // past the left edge
    if (x >= w) { x = w - 1; }  // past the right edge
    if (y < 0)  { y = 0; }      // past the top edge
    if (y >= h) { y = h - 1; }  // past the bottom edge
    return in_values[y * w + x];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -9,19 +24,42 @@ median_3x3(__global __read_only float *in_values,
int buf_w, int buf_h,
const int halo)
{

// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

const int x = get_global_id(0);
const int y = get_global_id(1);

// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

    // Load the relevant image values into the local buffer, halo
    // included: the first buf_w threads each fill one column.
    if (idx_1D < buf_w) {
        for (int row = 0; row < buf_h; row++) {
            buffer[row * buf_w + idx_1D] =
                get_clamped_value(in_values,
                                  w, h,
                                  buf_corner_x + idx_1D, buf_corner_y + row);
        }
    }

// Make sure all threads reach the next part after
// the local buffer is loaded
barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
@@ -31,4 +69,17 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.


// write output
if ((y < h) && (x < w)) // stay in bounds
out_values[y * w + x] = median9(buffer[(buf_y-1)*buf_w + buf_x-1],
buffer[(buf_y-1)*buf_w + buf_x],
buffer[(buf_y-1)*buf_w + buf_x+1],
buffer[(buf_y)*buf_w + buf_x-1],
buffer[(buf_y)*buf_w + buf_x],
buffer[(buf_y)*buf_w + buf_x+1],
buffer[(buf_y+1)*buf_w + buf_x-1],
buffer[(buf_y+1)*buf_w + buf_x],
buffer[(buf_y+1)*buf_w + buf_x+1]);
}
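
A NumPy reference implementation (not part of the PR) is useful for checking the kernel's output; np.pad with mode='edge' reproduces the same clamp-to-edge boundary handling as get_clamped_value. median_3x3_ref is a hypothetical name for this sketch.

    import numpy as np

    def median_3x3_ref(image):
        # replicate edge pixels, matching get_clamped_value's clamping
        padded = np.pad(image, 1, mode='edge')
        h, w = image.shape
        # stack the nine shifted views of each pixel's 3x3 neighborhood
        neighborhoods = np.stack([padded[dy:dy + h, dx:dx + w]
                                  for dy in range(3) for dx in range(3)])
        return np.median(neighborhoods, axis=0).astype(image.dtype)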
7 changes: 5 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,10 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'

def round_up(global_size, group_size):
r = global_size % group_size
@@ -51,7 +53,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
44 changes: 44 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,44 @@
I failed to run P5 on my Ubuntu virtual machine (the other problems work fine), so I ran my code on someone else's MacBook.

Part 1:
maze1:
Finished after 876 iterations, 612.37928 ms total, 0.699063105023 ms per iteration
Found 2 regions
maze2:
Finished after 506 iterations, 355.05848 ms total, 0.701696600791 ms per iteration
Found 35 regions

Part 2:
maze1:
Finished after 528 iterations, 375.64456 ms total, 0.711448030303 ms per iteration
Found 2 regions
maze2:
Finished after 272 iterations, 193.80496 ms total, 0.712518235294 ms per iteration
Found 35 regions

Part 3:
maze1:
Finished after 10 iterations, 8.67944 ms total, 0.867944 ms per iteration
Found 2 regions
maze2:
Finished after 9 iterations, 7.70608 ms total, 0.856231111111 ms per iteration
Found 35 regions

Part 4:
maze1:
Finished after 10 iterations, 29.2644 ms total, 2.92644 ms per iteration
Found 2 regions
maze2:
Finished after 9 iterations, 26.15008 ms total, 2.90556444444 ms per iteration

According to my results, using a single thread makes the program slower. At first I expected performance to improve,
because Part 4 avoids some of the redundant global memory reads. However, I got a different result. I think this is
because global memory reads on the GPU are relatively cheap, and since we now use a single thread per workgroup to
look up the values, the memory accesses have to be done serially.
The result might be different if we had more repeated labels.

Part 5:
The atomic operation works like a lock: if we used plain min instead of atomic_min, the label values could be read and
written by multiple threads simultaneously. In this case the result might still be correct even with plain min, because
threads in the same region write the same label. But the disadvantage of plain min is that performance might drop due
to redundant updates caused by race conditions.
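
To make the atomic semantics concrete, here is a small self-contained PyOpenCL sketch (not part of the submission): thousands of work-items race to fold their value into one global cell, and atomic_min guarantees no update is lost, so the assertion holds on every run.

    import numpy as np
    import pyopencl as cl

    KERNEL = """
    __kernel void global_min(__global const int *vals, __global int *result) {
        // atomic read-modify-write: safe even though every work-item
        // targets the same address
        atomic_min(result, vals[get_global_id(0)]);
    }
    """

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    prg = cl.Program(ctx, KERNEL).build()

    vals = np.random.randint(0, 1000000, size=4096).astype(np.int32)
    mf = cl.mem_flags
    d_vals = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vals)
    result = np.array([np.iinfo(np.int32).max], dtype=np.int32)
    d_result = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=result)

    prg.global_min(queue, (vals.size,), None, d_vals, d_result)
    cl.enqueue_copy(queue, result, d_result)
    assert result[0] == vals.min()  # atomicity makes this hold every run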

47 changes: 45 additions & 2 deletions HW3/P5/label_regions.cl
@@ -80,20 +80,63 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
// Part 2
//if (old_label < w*h){
// buffer[buf_y * buf_w + buf_x] = labels[buffer[buf_y * buf_w + buf_x]];
//}

    // Part 4: a single thread per workgroup fetches the grandparent
    // labels, caching the last lookup to avoid redundant global reads.
    if (lx == 0 && ly == 0) {
        int last_label = -1;  // label most recently looked up in global memory
        int last_value = -1;  // its fetched value, labels[last_label]

        // walk the whole local buffer, halo included
        for (int row = 0; row < buf_h; row++) {
            for (int col = 0; col < buf_w; col++) {
                int cur_idx = row * buf_w + col;
                int cur_label = buffer[cur_idx];

                if (cur_label < w * h) {            // skip background pixels
                    if (cur_label == last_label) {  // cache hit: reuse the value
                        buffer[cur_idx] = last_value;
                    } else {                        // cache miss: read from global
                        last_label = cur_label;
                        last_value = labels[cur_label];
                        buffer[cur_idx] = last_value;
                    }
                }
            }
        }
    }

    // make the single thread's buffer updates visible to the whole workgroup
barrier(CLK_LOCAL_MEM_FENCE);

// stay in bounds
if ((x < w) && (y < h)) {
        // CODE FOR PART 1 HERE
        // re-read our pixel: parts 2/4 may have updated the buffer
        new_label = buffer[buf_y * buf_w + buf_x];
        if (new_label < w * h) {
            // take the minimum of the pixel and its 4 neighboring pixels
            new_label = min(new_label, buffer[buf_y * buf_w + buf_x - 1]);
            new_label = min(new_label, buffer[buf_y * buf_w + buf_x + 1]);
            new_label = min(new_label, buffer[(buf_y - 1) * buf_w + buf_x]);
            new_label = min(new_label, buffer[(buf_y + 1) * buf_w + buf_x]);
        }

if (new_label != old_label) {
// CODE FOR PART 3 HERE
atomic_min(&labels[old_label], new_label);
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
atomic_min(&labels[y * w + x], new_label);
}
}

}