diff --git a/HW2/P2/P2.py b/HW2/P2/P2.py index 697bcd0a..701d2fc0 100644 --- a/HW2/P2/P2.py +++ b/HW2/P2/P2.py @@ -10,6 +10,7 @@ import numpy as np from timer import Timer from parallel_vector import move_data_serial, move_data_fine_grained, move_data_medium_grained +import matplotlib.pyplot as plt if __name__ == '__main__': ######################################## @@ -21,6 +22,7 @@ total = orig_counts.sum() + # serial move counts = orig_counts.copy() with Timer() as t: @@ -29,23 +31,66 @@ print("Serial uncorrelated: {} seconds".format(t.interval)) serial_counts = counts.copy() - # fine grained - counts[:] = orig_counts - with Timer() as t: - move_data_fine_grained(counts, src, dest, 100) - assert counts.sum() == total, "Wrong total after move_data_fine_grained" - print("Fine grained uncorrelated: {} seconds".format(t.interval)) + ### fine grained, 4 threads, uncorrelated, multilock #### + # counts[:] = orig_counts + # with Timer() as t: + # move_data_fine_grained(counts, src, dest, 100, 4) + # assert counts.sum() == total, "Wrong total after move_data_fine_grained" + # print("Fine grained uncorrelated: {} seconds".format(t.interval)) + # for t in threads: + + ### fine grained, multithread, uncorrelated ### + # Save output for multiple Threads + threads = range(1,10) + time_fine_threads_uncor = [] + + for th in threads: + counts[:] = orig_counts + with Timer() as t: + move_data_fine_grained(counts, src, dest, 100, th) + assert counts.sum() == total, "Wrong total after move_data_fine_grained" + print("Number of Threads: {}".format(th)) + print("Fine grained uncorrelated: {} seconds".format(t.interval)) + time_fine_threads_uncor.append(t.interval) ######################################## # You should explore different values for the number of locks in the medium # grained locking ######################################## - N = 10 - counts[:] = orig_counts - with Timer() as t: - move_data_medium_grained(counts, src, dest, 100, N) - assert counts.sum() == total, "Wrong total after move_data_medium_grained" - print("Medium grained uncorrelated: {} seconds".format(t.interval)) + + ### Medium grained, 4 threads, uncorrelated multi-lock ### + # Create a range of N values for Part 1 + N_buffer = np.arange(1,21) + + # Create a few N values + N_pts = [1,2,4,5,10] + + # Save Medium grain time results + output_time_mg_uncor = [] + output_time_mg_uncor_pts = [] + + # for N in N_buffer: + # counts[:] = orig_counts + # with Timer() as t: + # move_data_medium_grained(counts, src, dest, 100, N, 4) + # assert counts.sum() == total, "Wrong total after move_data_medium_grained" + # if (N == N_pts).any(): + # print("Number of Locks: {}".format(N)) + # print("Medium grained uncorrelated: {} seconds".format(t.interval)) + # output_time_mg_uncor_pts.append(t.interval) + # output_time_mg_uncor.append(t.interval) + + ### Medium Grain, Multithread, Uncorrelated ### + time_med_threads_uncor = [] + for th in threads: + N=10 + counts[:] = orig_counts + with Timer() as t: + move_data_medium_grained(counts, src, dest, 100, N, th) + assert counts.sum() == total, "Wrong total after move_data_medium_grained" + print("Number of Threads: {}".format(th)) + print("Medium grained uncorrelated: {} seconds".format(t.interval)) + time_med_threads_uncor.append(t.interval) ######################################## # Now use correlated data movement @@ -62,21 +107,85 @@ assert counts.sum() == total, "Wrong total after move_data_serial" print("Serial correlated: {} seconds".format(t.interval)) serial_counts = counts.copy() + + ### fine 
grained, 4 threads, correlated, multilock #### + # counts[:] = orig_counts + # with Timer() as t: + # move_data_fine_grained(counts, src, dest, 100, 4) + # assert counts.sum() == total, "Wrong total after move_data_fine_grained" + # print("Fine grained correlated: {} seconds".format(t.interval)) - # fine grained - counts[:] = orig_counts - with Timer() as t: - move_data_fine_grained(counts, src, dest, 100) - assert counts.sum() == total, "Wrong total after move_data_fine_grained" - print("Fine grained correlated: {} seconds".format(t.interval)) - + ### fine grained, multithread, correlated ### + time_fine_threads_cor = [] + for th in threads: + counts[:] = orig_counts + with Timer() as t: + move_data_fine_grained(counts, src, dest, 100, th) + assert counts.sum() == total, "Wrong total after move_data_fine_grained" + print("Number of Locks: {}".format(th)) + print("Fine grained uncorrelated: {} seconds".format(t.interval)) + time_fine_threads_cor.append(t.interval) + ######################################## # You should explore different values for the number of locks in the medium # grained locking ######################################## - N = 10 - counts[:] = orig_counts - with Timer() as t: - move_data_medium_grained(counts, src, dest, 100, N) - assert counts.sum() == total, "Wrong total after move_data_medium_grained" - print("Medium grained correlated: {} seconds".format(t.interval)) + + ### medium grained, 4 threads, correlated, multilock #### + # output_time_mg_cor = [] + # output_time_mg_cor_pts = [] + # for N in N_buffer: + # counts[:] = orig_counts + # with Timer() as t: + # move_data_medium_grained(counts, src, dest, 100, N, 4) + # assert counts.sum() == total, "Wrong total after move_data_medium_grained" + # if (N == N_pts).any(): + # print("Number of Locks: {}".format(N)) + # print("Medium grained correlated: {} seconds".format(t.interval)) + # output_time_mg_cor_pts.append(t.interval) + # output_time_mg_cor.append(t.interval) + + + ### Medium Grain, Multithread, Correlated ### + time_med_threads_cor = [] + N=10 + for th in threads: + counts[:] = orig_counts + with Timer() as t: + move_data_medium_grained(counts, src, dest, 100, N, th) + assert counts.sum() == total, "Wrong total after move_data_medium_grained" + print("Number of Threads: {}".format(th)) + print("Medium grained uncorrelated: {} seconds".format(t.interval)) + time_med_threads_cor.append(t.interval) + + + ### Multi-N ploting #### + # plt.figure(figsize=(10,8)) + # plt.plot(N_buffer, output_time_mg_uncor) + # plt.plot(N_buffer, output_time_mg_cor) + # plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'UnCorrelated') + # plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated') + # plt.title("Time of Array Suffle Correlated vs. Uncorrelated (Medium Grain Locking)") + # plt.xlabel("N") + # plt.ylabel("Completation Time") + # plt.legend(loc=2) + # plt.show() + + ### Multithread ploting #### + plt.figure(figsize=(10,8)) + + # Uncorrelated + plt.plot(threads, time_fine_threads_uncor, label=u'Fine Uncorrelated') + plt.plot(threads, time_med_threads_uncor, label=u'Medium Uncorrelated') + + # Correlated + plt.plot(threads, time_fine_threads_cor, label=u'Fine Correlated') + plt.plot(threads, time_med_threads_cor, label=u'Medium Correlated') + + # plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'UnCorrelated') + # plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated') + plt.title("Time of Multithreaded Array Suffle Correlated vs. 
Uncorrelated") + plt.xlabel("Threads") + plt.ylabel("Completation Time") + plt.legend(loc=2) + plt.show() diff --git a/HW2/P2/P2.txt b/HW2/P2/P2.txt new file mode 100644 index 00000000..3d257b37 --- /dev/null +++ b/HW2/P2/P2.txt @@ -0,0 +1,42 @@ +### Part A ### + +# Uncorrelated +Serial uncorrelated: 0.327378034592 seconds +Fine grained uncorrelated: 8.42167806625 seconds + +Number of Locks: 1 +Medium grained uncorrelated: 9.59998893738 seconds +Number of Locks: 2 +Medium grained uncorrelated: 10.9623169899 seconds +Number of Locks: 4 +Medium grained uncorrelated: 11.0328228474 seconds +Number of Locks: 5 +Medium grained uncorrelated: 11.3029088974 seconds +Number of Locks: 10 +Medium grained uncorrelated: 9.50029206276 seconds + + +# Correlated +Serial correlated: 0.472969055176 seconds +Fine grained correlated: 7.73167490959 seconds + +Number of Locks: 1 +Medium grained correlated: 9.56373286247 seconds +Number of Locks: 2 +Medium grained correlated: 8.43163609505 seconds +Number of Locks: 4 +Medium grained correlated: 8.61915707588 seconds +Number of Locks: 5 +Medium grained correlated: 8.97014594078 seconds +Number of Locks: 10 +Medium grained correlated: 11.4742889404 seconds + +For this part of the problem, we fixed the number of threads to 4 and changed the amount of locks available to the medium grained data move. We can see from the plotted results that there is not much difference in execution time for the uncorrelated data move, i.e. the amount of data we lock for each thread does not change the performance of the data move. For the correlated data, we see that additional locks increase the performance. This makes sense as we increase the number of locks, there is additional data for threads to operate. The ideal amount of locks should be N=20 as this would ensure that the source and destination are no more than 10 elements away from one another in either direction. However, my results tell a different story. There is a bad performance "spikes" at N = 10 and 20. Based on the results I would pick N between 10 and 20. + +### Part B ### + +Serial uncorrelated: 0.36355805397 seconds +Serial correlated: 0.428131818771 seconds + +In part B we were required to fix the number of locks to 10 and see how performance varies as we enumerate the amount of the threads available to move data. From the plot we can see that there is a point of diminishing returns after 4 threads. The performance is improved by 4 threads but decreases very quickly as we increase the number of threads. Moreover, the threaded results are much worse than the serial results, this is due to the communication needed between threads to move the data. 
+ diff --git a/HW2/P2/P2_multilock.png b/HW2/P2/P2_multilock.png new file mode 100644 index 00000000..1fb22b0e Binary files /dev/null and b/HW2/P2/P2_multilock.png differ diff --git a/HW2/P2/P2_mutlithread.png b/HW2/P2/P2_mutlithread.png new file mode 100644 index 00000000..3f13084c Binary files /dev/null and b/HW2/P2/P2_mutlithread.png differ diff --git a/HW2/P2/parallel_vector.pyx b/HW2/P2/parallel_vector.pyx index 5d67797f..900f2041 100644 --- a/HW2/P2/parallel_vector.pyx +++ b/HW2/P2/parallel_vector.pyx @@ -1,12 +1,10 @@ # turn off bounds checking & wraparound for arrays -#cython: boundscheck=False, wraparound=False +# cython: boundscheck=False, wraparound=False ################################################## # setup and helper code ################################################## - - -from cython.parallel import parallel, prange +from cython.parallel import parallel, prange, threadid from openmp cimport omp_lock_t, \ omp_init_lock, omp_destroy_lock, \ omp_set_lock, omp_unset_lock, omp_get_thread_num @@ -15,7 +13,6 @@ from libc.stdlib cimport malloc, free import numpy as np cimport numpy as np - # lock helper functions cdef void acquire(omp_lock_t *l) nogil: omp_set_lock(l) @@ -45,11 +42,71 @@ cdef void free_N_locks(int N, omp_lock_t *locks) nogil: free( locks) +# My function for checking lock conditions for fine grain +cdef void check_lock(int idx, omp_lock_t *locks, np.int32_t[:] counts, + np.int32_t[:] src, np.int32_t[:] dest) nogil: + + # If the source index is greater, grab it's lock first + # This prevents deadlocking + if src[idx] > dest[idx]: + acquire(&locks[src[idx]]) + acquire(&locks[dest[idx]]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]]) + release(&locks[dest[idx]]) + + # If the destination's index is greater, grab it first + # Also prevents deadlocking + elif src[idx] < dest[idx]: + acquire(&locks[dest[idx]]) + acquire(&locks[src[idx]]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]]) + release(&locks[dest[idx]]) + + # If the indexs are equal, only grab one lock + # This prevents double locking + else: + acquire(&locks[src[idx]]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]]) + + +# My Function Updated for course grain locking +cdef void check_lock_med(int idx, int N, omp_lock_t *locks, np.int32_t[:] counts, + np.int32_t[:] src, np.int32_t[:] dest) nogil: + + # We must now check for the index/N as that cooresponds to the + # number of locks that are available + if src[idx]/N > dest[idx]/N: + acquire(&locks[src[idx]/N]) + acquire(&locks[dest[idx]/N]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]/N]) + release(&locks[dest[idx]/N]) + + elif src[idx]/N < dest[idx]/N: + acquire(&locks[dest[idx]/N]) + acquire(&locks[src[idx]/N]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]/N]) + release(&locks[dest[idx]/N]) + + else: + acquire(&locks[src[idx]/N]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]/N]) + ################################################## # Your code below ################################################## - cpdef move_data_serial(np.int32_t[:] counts, np.int32_t[:] src, np.int32_t[:] dest, @@ -65,11 +122,12 @@ cpdef move_data_serial(np.int32_t[:] counts, counts[dest[idx]] += 1 counts[src[idx]] -= 1 - +# Updated move_data_fine_grained for parallel implementation cpdef move_data_fine_grained(np.int32_t[:] counts, np.int32_t[:] src, np.int32_t[:] dest, - int repeat): + int repeat, + int 
threads): cdef: int idx, r omp_lock_t *locks = get_N_locks(counts.shape[0]) @@ -79,12 +137,11 @@ cpdef move_data_fine_grained(np.int32_t[:] counts, # Use parallel.prange() and a lock for each element of counts to parallelize # data movement. Be sure to avoid deadlock, and double-locking. ########## - with nogil: - for r in range(repeat): - for idx in range(src.shape[0]): - if counts[src[idx]] > 0: - counts[dest[idx]] += 1 - counts[src[idx]] -= 1 + + for r in xrange(repeat): + for idx in prange(src.shape[0], nogil=True, num_threads=threads): + if counts[src[idx]] > 0: + check_lock(idx, locks, counts, src, dest) free_N_locks(counts.shape[0], locks) @@ -93,7 +150,8 @@ cpdef move_data_medium_grained(np.int32_t[:] counts, np.int32_t[:] src, np.int32_t[:] dest, int repeat, - int N): + int N, + int threads): cdef: int idx, r int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks @@ -105,11 +163,11 @@ cpdef move_data_medium_grained(np.int32_t[:] counts, # to parallelize data movement. Be sure to avoid deadlock, as well as # double-locking. ########## - with nogil: - for r in range(repeat): - for idx in range(src.shape[0]): - if counts[src[idx]] > 0: - counts[dest[idx]] += 1 - counts[src[idx]] -= 1 - - free_N_locks(num_locks, locks) + + for r in xrange(repeat): + for idx in prange(src.shape[0], nogil=True, num_threads=threads): + if counts[src[idx]] > 0: + check_lock_med(idx, N, locks, counts, src, dest) + + free_N_locks(num_locks, locks) + diff --git a/HW2/P3/P3.txt b/HW2/P3/P3.txt new file mode 100644 index 00000000..bef331c0 --- /dev/null +++ b/HW2/P3/P3.txt @@ -0,0 +1,26 @@ +P3.txt + +### Serial Implementation ### +1074.656613 Million Complex FMAs in 6.37947392464 seconds, 168.455365708 million Complex FMAs / second + +### Multithread Implementation ### +1 Thread +1074.656613 Million Complex FMAs in 6.59998703003 seconds, 162.827079525 million Complex FMAs / second +2 Threads +1074.656613 Million Complex FMAs in 3.32439684868 seconds, 323.263636057 million Complex FMAs / second +4 Threads +1074.656613 Million Complex FMAs in 1.87254285812 seconds, 573.902278571 million Complex FMAs / second + +### AVX Implementation ### +1 Thread +1074.656613 Million Complex FMAs in 1.8737821579 seconds, 573.522705652 million Complex FMAs / second +2 Threads +1074.957423 Million Complex FMAs in 0.47437620163 seconds, 2266.04416349 million Complex FMAs / second +4 Threads +1074.957423 Million Complex FMAs in 0.386723995209 seconds, 2779.65018028 million Complex FMAs / second + +In this problem we worked with the infamous Mandelbrot data set which we have become so familiar with over the past month, and the task was to implement the calculation first using multithreading, then instruction-level parallelism. In the multithread implementation, we can observe that there is performance decrease when comparing the serial version to the 1 threaded solution. This once again highlights the communication costs of using a threaded approach. However, once the number of thread are increased, a significant speed up in the calculation is achieved. By doubling the amount of threads, the same number of complex FMAs are performed in approximately half the time. In the 4 thread implementation, the performance is improved by x3.5. This shows that adding threads to a ridiculous parallel problem does in fact increase the performance. + +In the instruction-level parallelism portion of this problem, we were asked to implement the Mandelbrot calculation using 8-way instruction-level parallelism via AVX. 
The performance boost achieved by implementing SIMD is outstanding. The 8-way parallel implementation with just one thread is almost 4 times faster than the serial version. Using 4 threads, there is a speed up of ~x16! + +AVX was quite an adventure for me, I had no experience using SIMD but I have previous programmed in assembly. I have to say that it was very difficult, but also very rewarding. It's obvious that there are real performance benefits associated with AVX, but it comes with a price, which is a complex program. I had to completely change the way I thought about the problem and really consider every implementation decision that I made. The discovery of both fmadd and fmsub turned out to be very helpful for the real and imaginary z calculations. I spent a few office hour sessions walking through this calculation and I have to thank both Kevin, Phillipe and Adi for their assistance. \ No newline at end of file diff --git a/HW2/P3/driver.py b/HW2/P3/driver.py index 791eda47..b7ab9b4c 100644 --- a/HW2/P3/driver.py +++ b/HW2/P3/driver.py @@ -35,4 +35,4 @@ def make_coords(center=(-0.575 - 0.575j), print("{} Million Complex FMAs in {} seconds, {} million Complex FMAs / second".format(out_counts.sum() / 1e6, seconds, (out_counts.sum() / seconds) / 1e6)) plt.imshow(np.log(out_counts)) - plt.show() + plt.show() \ No newline at end of file diff --git a/HW2/P3/mandelbrot.pyx b/HW2/P3/mandelbrot.pyx index a019cfa2..2629d7e2 100644 --- a/HW2/P3/mandelbrot.pyx +++ b/HW2/P3/mandelbrot.pyx @@ -5,19 +5,18 @@ import numpy cimport AVX from cython.parallel import prange - cdef np.float64_t magnitude_squared(np.complex64_t z) nogil: return z.real * z.real + z.imag * z.imag +# cdef np.float32_t avx_mag_square(np.ndarray[np.float32_t, ndim=8] avx_z): +# return AVX.sqrt(avx_z) + @cython.boundscheck(False) @cython.wraparound(False) cpdef mandelbrot(np.complex64_t [:, :] in_coords, np.uint32_t [:, :] out_counts, int max_iterations=511): - cdef: - int i, j, iter - np.complex64_t c, z - + # To declare AVX.float8 variables, use: # cdef: # AVX.float8 v1, v2, v3 @@ -31,6 +30,18 @@ cpdef mandelbrot(np.complex64_t [:, :] in_coords, assert in_coords.shape[0] == out_counts.shape[0], "Input and output arrays must be the same size" assert in_coords.shape[1] == out_counts.shape[1], "Input and output arrays must be the same size" + #out_counts = serial_mandel(in_coords, out_counts, max_iterations) + #out_counts = mandel_multi(in_coords, out_counts, 4, max_iterations) + out_counts = mandel_AVX(in_coords, out_counts, 4, max_iterations) + +cpdef serial_mandel(np.complex64_t [:, :] in_coords, + np.uint32_t [:, :] out_counts, + int max_iterations=511): + cdef: + int i, j, iter + np.complex64_t c, z + + # Serial Code with nogil: for i in range(in_coords.shape[0]): for j in range(in_coords.shape[1]): @@ -41,17 +52,117 @@ cpdef mandelbrot(np.complex64_t [:, :] in_coords, break z = z * z + c out_counts[i, j] = iter + + return out_counts - +cpdef mandel_multi(np.complex64_t [:, :] in_coords, + np.uint32_t [:, :] out_counts, + int threads, + int max_iterations=511): + cdef: + int i, j, iter + np.complex64_t c, z + + # Completed Multithreading Code + for i in prange(in_coords.shape[0], nogil=True, schedule='static', chunksize=1, num_threads=threads): + for j in xrange(in_coords.shape[1]): + c = in_coords[i, j] + z = 0 + for iter in xrange(max_iterations): + if magnitude_squared(z) > 4: + break + z = z * z + c + out_counts[i, j] = iter + + return out_counts + +cpdef mandel_AVX(np.complex64_t [:, :] in_coords, + np.uint32_t [:, :] 
out_counts, + int threads, + int max_iterations=511): + + cdef: + AVX.float8 real_avx, imag_avx, complex_avx + AVX.float8 z_real, z_imag, z_r_tmp + AVX.float8 mag_real_avx, mag_imag_avx, mag_tot_avx + AVX.float8 cutoff_avx, two_avx, one_avx, mask_avx, compare_avx, count_avx, c_plus_one + + np.float32_t [:,:] real_np, imag_np + + int i, j, k, iter + + # Convert Complex to real and image float32 arrays + real_np = np.real(in_coords) + imag_np = np.imag(in_coords) + + # Initialize the real and imaginary AVX floats + real_avx = AVX.float_to_float8(0) + imag_avx = AVX.float_to_float8(0) + + # Initialize the magnitude vectors for real, imag, and output + mag_real_avx = AVX.float_to_float8(0) + mag_imag_avx = AVX.float_to_float8(0) + mag_tot_avx = AVX.float_to_float8(0) + + # Initialize cutoff=4, two=2, one=1, compare and mask AVX vectors + cutoff_avx = AVX.float_to_float8(4) + two_avx = AVX.float_to_float8(2) + one_avx = AVX.float_to_float8(1) + compare_avx = AVX.float_to_float8(0) + mask_avx = AVX.float_to_float8(0) + + for i in prange(in_coords.shape[0], nogil=True, schedule='static', chunksize=1, num_threads=threads): + #for i in xrange(in_coords.shape[0]): + for j in range(0, in_coords.shape[1], 8): + + # Initialize the Real and Imag parts of the AVX values + # c = in_coords[i, j] + real_avx = AVX.make_float8(real_np[i, j], real_np[i, j+1], real_np[i, j+2], real_np[i, j+3], + real_np[i, j+4], real_np[i, j+5], real_np[i, j+6], real_np[i, j+7]) + imag_avx = AVX.make_float8(imag_np[i, j], imag_np[i, j+1], imag_np[i, j+2], imag_np[i, j+3], + imag_np[i, j+4], imag_np[i, j+5], imag_np[i, j+6], imag_np[i, j+7]) + + # z = 0 + z_real = AVX.float_to_float8(0) + z_imag = AVX.float_to_float8(0) + z_r_tmp = AVX.float_to_float8(0) + count_avx = AVX.float_to_float8(0) + + for iter in range(max_iterations): + # Compute Magnitude + mag_real_avx = AVX.mul(z_real, z_real) + mag_tot_avx = AVX.fmadd(z_imag, z_imag, mag_real_avx) + + # Only modify AVX values if less than 4 + compare_avx = AVX.less_than(mag_tot_avx, cutoff_avx) + mask_avx = AVX.bitwise_and(one_avx, compare_avx) + count_avx = AVX.add(count_avx, mask_avx) + + # Break if all values are great than 4 + if not AVX.signs(compare_avx): + break + + # Update Z_real and Z_complex + z_r_tmp = AVX.fmsub(z_real, z_real, AVX.fmsub(z_imag, z_imag, real_avx)) + z_imag = AVX.fmadd(two_avx, AVX.mul(z_real, z_imag), imag_avx) + z_real = z_r_tmp + + # Update outcounts + for k in range(8): + out_counts[i,j+k] = (( &count_avx)[k]) + + return out_counts # An example using AVX instructions cpdef example_sqrt_8(np.float32_t[:] values): cdef: - AVX.float8 avxval, tmp, mask - float out_vals[8] + AVX.float8 avxval, tmp, mask, new + float out_vals[8], out_new[8] float [:] out_view = out_vals + float [:] new_view = out_new assert values.shape[0] == 8 + new = AVX.float_to_float8(5) # Note that the order of the arguments here is opposite the direction when # we retrieve them into memory. 
@@ -73,5 +184,9 @@ cpdef example_sqrt_8(np.float32_t[:] values): avxval = AVX.bitwise_andnot(mask, avxval) AVX.to_mem(avxval, &(out_vals[0])) + AVX.to_mem(new, &(out_new[0])) + + print("Out View:", np.array(out_view)) + print("New:", np.array(new_view)) return np.array(out_view) diff --git a/HW2/P4/P4.txt b/HW2/P4/P4.txt new file mode 100644 index 00000000..8ecfae77 --- /dev/null +++ b/HW2/P4/P4.txt @@ -0,0 +1,14 @@ +I referenced the following webpage for this problem and collaborated with Patrick Kuiper: +https://pymotw.com/2/threading/ + +Original Run +5.32677793503 seconds for 10 filter passes. + +Threaded Code Runs +5.90390110016 seconds for 10 filter passes and 1 threads +3.10576891899 seconds for 10 filter passes and 2 threads +2.29144191742 seconds for 10 filter passes and 4 threads + +In this problem we discovered how to implement python threads using events objects. I used the events object call to keep track of each thread’s progress for every iteration of the image filter. Events allow for threads to keep track of one another and to sync up between iterations. I found the "set()" logic to be counter intuitive as you set the current processing thread, then alert it's neighbor threads that they must "wait()" until each other have completed their calculation. I originally thought that this syntax would be the other way around, which led to many hours of python event fun! + +Based on my results, one can see that adding more threads does speed the median filter algorithm up. However, there is an overhead associated with establishing threads as the single thread execution took longer than the no thread original test. \ No newline at end of file diff --git a/HW2/P4/driver.py b/HW2/P4/driver.py index 0051a121..a0ea23e2 100644 --- a/HW2/P4/driver.py +++ b/HW2/P4/driver.py @@ -27,6 +27,63 @@ def py_median_3x3(image, iterations=10, num_threads=1): return tmpA + +def worker(events, thread_id, thread_num, iterations, orig_imag, new_imag): + for i in range(iterations): + + # Do nothing for the first iteration, however still needed for i-1 + if i == 0: + pass + + # Do nothing if there is only one thread + elif thread_num == 1: + pass + + # Corner case first thread only has 1 dependent thread + elif thread_id == 0: + events[thread_id+1, i-1].wait() + + # Corner case last thread only has 1 dependent thread + elif thread_id == thread_num-1: + events[thread_id-1, i-1].wait() + + # All other threads have 2 dependents + # Tell both previous and next to wait on current thread + else: + events[thread_id-1, i-1].wait() + events[thread_id+1, i-1].wait() + + # Run provided cython function to filter + filtering.median_3x3(orig_imag, new_imag, thread_id, thread_num) + + # Set current thread ID so that dependent threads know to wait + events[thread_id, i].set() + + # Simultaneous change original image and new filtered image + orig_imag, new_imag = new_imag, orig_imag + +def py_median_3x3_threaded(image, iterations, num_threads): + ''' filter with a 3x3 median threading ''' + tmpA = image.copy() + tmpB = np.empty_like(tmpA) + + # Create the event array for all threads and every iteration + events = np.array([[threading.Event()]*iterations]*num_threads) + + # Create Threads given the worker function and start + for t in range(num_threads): + threads = threading.Thread(target=worker, args=(events, t, num_threads, iterations, tmpA, tmpB)) + threads.start() + + # Join Threads to ensure all threads have completed their work + threads.join() + + # If odd number of interations return empty image + if iterations % 2 
== 1: + return tmpB + else: + return tmpA + def numpy_median(image, iterations=10): ''' filter using numpy ''' for i in range(iterations): @@ -42,26 +99,47 @@ def numpy_median(image, iterations=10): if __name__ == '__main__': input_image = np.load('image.npz')['image'].astype(np.float32) - pylab.gray() + # pylab.gray() - pylab.imshow(input_image) - pylab.title('original image') + # pylab.imshow(input_image) + # pylab.title('original image') - pylab.figure() - pylab.imshow(input_image[1200:1800, 3000:3500]) - pylab.title('before - zoom') + # pylab.figure() + # pylab.imshow(input_image[1200:1800, 3000:3500]) + # pylab.title('before - zoom') # verify correctness - from_cython = py_median_3x3(input_image, 2, 5) - from_numpy = numpy_median(input_image, 2) - assert np.all(from_cython == from_numpy) + # from_cython = py_median_3x3(input_image, 2, 5) + # from_numpy = numpy_median(input_image, 2) + # assert np.all(from_cython == from_numpy) with Timer() as t: - new_image = py_median_3x3(input_image, 10, 8) + new_image = py_median_3x3(input_image, 10, 1) - pylab.figure() - pylab.imshow(new_image[1200:1800, 3000:3500]) - pylab.title('after - zoom') + # pylab.figure() + # pylab.imshow(new_image[1200:1800, 3000:3500]) + # pylab.title('after - zoom') print("{} seconds for 10 filter passes.".format(t.interval)) - pylab.show() + # pylab.show() + + ### My Code ### + from_cython = py_median_3x3_threaded(input_image, 2, 5) + from_numpy = numpy_median(input_image, 2) + assert np.all(from_cython == from_numpy), "Images don't match!" + + with Timer() as time1: + threaded_image = py_median_3x3_threaded(input_image, 10, 1) + + with Timer() as time2: + threaded_image = py_median_3x3_threaded(input_image, 10, 2) + + with Timer() as time4: + threaded_image = py_median_3x3_threaded(input_image, 10, 4) + + results = [(1, time1), (2, time2), (4, time4)] + + for t, r in results: + print("{} seconds for 10 filter passes and {} threads".format(r.interval, t)) + + diff --git a/HW2/P5/P5.txt b/HW2/P5/P5.txt new file mode 100644 index 00000000..466ccae5 --- /dev/null +++ b/HW2/P5/P5.txt @@ -0,0 +1,42 @@ +P5.txt + +### SUBPROBLEM 1: multithreading ### + +# Original Implementation +10 frame update average ~ 8.4886844904079997 simulation frames per second +# 1 Thread +10 frame update average ~ 8.9781141036150025 simulation frames per second +# 4 threads +10 frame update average ~ 13.478761244959998 simulation frames per second + +For subproblem 1, we updated the physics engine with multithreading (via a prange in update function). We can see from the average simulation frames per second that adding additional threads for computation slightly increases the performance of the program. However, these updates are not substantial and as we discovered later in the problem, there is room for improvement. + +### SUBPROBLEM 2: Spatial decomposition ### + +# 1 Thread +10 frame update average ~ 327.77711671729998 simulation frames per second +# 4 Threads +10 frame update average ~ 488.2193716649 simulation frames per second + +For subproblem 2, we see a huge increase in performance due to the spatial decomposition. The SD was achieved by finding the position of the balls in the grid and only searching in a small box around the ball of interest instead of searching all balls in space. This significantly cuts down the amount of comparisons needed to see if balls collide and there are performance benefits from doing so. 
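+
+Illustrative sketch (my addition, not the submitted physics.pyx): the spatial decomposition only compares a ball against balls registered in nearby grid cells. Since grid_spacing = radius / sqrt(2), two overlapping balls can be at most 2*sqrt(2) (about 2.8) cells apart, so scanning a +/-3 cell window around the ball is enough. The function name and window argument below are just for illustration:
+
+    import numpy as np
+
+    def nearby_candidates(grid, pos, grid_spacing, window=3):
+        # Grid cell occupied by this ball.
+        gx, gy = int(pos[0] / grid_spacing), int(pos[1] / grid_spacing)
+        size = grid.shape[0]
+        empty = np.uint32(-1)   # the grid is initialised to -1 (no ball in the cell)
+        candidates = []
+        # Scan only the (2*window + 1)^2 cells around the ball, clipped to the grid edges.
+        for x in range(max(gx - window, 0), min(size, gx + window + 1)):
+            for y in range(max(gy - window, 0), min(size, gy + window + 1)):
+                j = grid[x, y]
+                if j != empty:   # this cell holds a ball index
+                    candidates.append(int(j))
+        return candidates
+
+The returned candidate list is what replaces the all-pairs loop over every other ball, which is where the large speedup comes from.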
+ +### SUBPROBLEM 3: Spatially Coherent Sorting ### + +# 1 Thread +10 frame update average ~ 539.95193751989996 simulation frames per second +# 4 Threads +10 frame update average ~ 728.73026065850001 simulation frames per second + +In subproblem 3, we were asked to arrange the balls in to Morton/Zorder or Hilbert spacing in order for balls that are near each other in the grid to be located near each in memory. I choose to implement a morton curve as I wasn't familiar with either Morton or Hilbert and the Morton/Zorder curve seemed easier to implement. The Zorder is calculated by interleaving bits for 2 positions together (e.g. [0,1] and [1,0] would become [0110]) then sorting based on this new representation. We see a performance increase due to the fact that the balls are closer together in memory and therefore we don't have to go as far in memory for the comparison. + +I used the following website as a reference and collaborated with Patrick Kupier on this subproblem: +http://www.thejach.com/view/2011/9/playing_with_morton_numbers + +### SUBPROBLEM 4: Locks ### + +# 1 Thread +10 frame update average ~ 404.76805630230001 simulation frames per second +# 4 Threads +10 frame update average ~ 644.6591968916 simulation frames per second + +In subproblem 4, we were tasked to implement locks in order to ensure that the multiple threads weren't altering the positions of the balls. Since the threads are implementing locks in a fine grain fashion, we actually realize degraded performance. Although the performance of the program is decreased, our program actually becomes more realistic as we are not updating collisions multiple times for the same collision. In the end, our parallel implementation is approx. 90x faster than the serial implementation. Parallel coding is the way to go! \ No newline at end of file diff --git a/HW2/P5/driver.py b/HW2/P5/driver.py index fdd1a102..69727a62 100644 --- a/HW2/P5/driver.py +++ b/HW2/P5/driver.py @@ -16,9 +16,35 @@ def randcolor(): return np.random.uniform(0.0, 0.89, (3,)) + 0.1 +### Subproblem 3 ### +### Z-Order Implementation ### +# Referenced from https://en.wikipedia.org/wiki/Z-order_curve +def cmp_zorder(arr1, arr2, dim=2): + j = 0 + k = 0 + x = 0 + a = arr1[1] + b = arr2[1] + for k in range(dim): + y = a[k] ^ b[k] + if less_msb(x, y): + j = k + x = y + return a[j] - b[j] + +def less_msb(x, y): + return x < y and x < (x ^ y) + + if __name__ == '__main__': + # Real Balls num_balls = 10000 radius = 0.002 + + # Test Balls + # num_balls = 10 + # radius = 0.05 + positions = np.random.uniform(0 + radius, 1 - radius, (num_balls, 2)).astype(np.float32) @@ -41,6 +67,7 @@ def randcolor(): # Each square in the grid stores the index of the object in that square, or # -1 if no object. We don't worry about overlapping objects, and just # store one of them. + # Grid Shape (708, 708) grid_spacing = radius / np.sqrt(2.0) grid_size = int((1.0 / grid_spacing) + 1) grid = - np.ones((grid_size, grid_size), dtype=np.uint32) @@ -64,8 +91,7 @@ def randcolor(): while True: with Timer() as t: update(positions, velocities, grid, - radius, grid_size, locks_ptr, - physics_step) + radius, locks_ptr, physics_step, grid_spacing, grid_size) # udpate our estimate of how fast the simulator runs physics_step = 0.9 * physics_step + 0.1 * t.interval @@ -77,6 +103,31 @@ def randcolor(): print("{} simulation frames per second".format(frame_count / total_time)) frame_count = 0 total_time = 0 + # SUBPROBLEM 3: sort objects by location. Be sure to update the # grid if objects' indices change! 
Also be sure to sort the # velocities with their object positions! + + # Create ball grid positions based on the grid spacing + grid_pos = (positions/grid_spacing).astype(int) + + # Create ball grid positions based on the grid spacing + grid_pos = (positions/grid_spacing).astype(int) + + # Zip an original index together with positons + pos_arr = zip(xrange(num_balls), grid_pos) + + # Sort the array based on the Zorder, and get the order indexs + pos_arr.sort(cmp_zorder) + z_idx = [idx for idx, pos in pos_arr] + + # Update position and Velocity + positions = positions[z_idx]; velocities = velocities[z_idx] + + # Ensure that all balls are inside [0,1] + inside = np.array(filter(lambda x: x[0] > 0 and x[1] < 1, positions)) + num_inside = len(inside) + + # Update the inside balls + grid = - np.ones((grid_size, grid_size), dtype=np.uint32) + grid[(inside[:, 0]/grid_spacing).astype(int), (positions[:, 1]/grid_spacing).astype(int)] = np.array(range(num_inside)) \ No newline at end of file diff --git a/HW2/P5/physics.pyx b/HW2/P5/physics.pyx index dbd78e6e..f0d9cceb 100644 --- a/HW2/P5/physics.pyx +++ b/HW2/P5/physics.pyx @@ -1,15 +1,20 @@ #cython: boundscheck=False, wraparound=False + cimport numpy as np from libc.math cimport sqrt from libc.stdint cimport uintptr_t cimport cython from omp_defs cimport omp_lock_t, get_N_locks, free_N_locks, acquire, release +from cython.parallel import parallel, prange, threadid # Useful types ctypedef np.float32_t FLOAT ctypedef np.uint32_t UINT +# Unsigned Int -1 Global Variable +cdef UINT neg_one = 4294967295 + cdef inline int overlapping(FLOAT *x1, FLOAT *x2, float R) nogil: @@ -38,14 +43,18 @@ cdef inline void collide(FLOAT *x1, FLOAT *v1, float change_v1[2] float len_x1_m_x2, dot_v_x int dim + # https://en.wikipedia.org/wiki/Elastic_collision#Two-dimensional_collision_with_two_moving_objects for dim in range(2): x1_minus_x2[dim] = x1[dim] - x2[dim] v1_minus_v2[dim] = v1[dim] - v2[dim] + len_x1_m_x2 = x1_minus_x2[0] * x1_minus_x2[0] + x1_minus_x2[1] * x1_minus_x2[1] dot_v_x = v1_minus_v2[0] * x1_minus_x2[0] + v1_minus_v2[1] * x1_minus_x2[1] + for dim in range(2): change_v1[dim] = (dot_v_x / len_x1_m_x2) * x1_minus_x2[dim] + for dim in range(2): v1[dim] -= change_v1[dim] v2[dim] += change_v1[dim] # conservation of momentum @@ -55,46 +64,83 @@ cdef void sub_update(FLOAT[:, ::1] XY, float R, int i, int count, UINT[:, ::1] Grid, - float grid_spacing) nogil: + float grid_spacing, + int grid_size, + omp_lock_t *locks) nogil: cdef: FLOAT *XY1, *XY2, *V1, *V2 - int j, dim + int j, dim, x, y float eps = 1e-5 + int pos_grid_x, pos_grid_y # SUBPROBLEM 4: Add locking + acquire(&(locks[i])) XY1 = &(XY[i, 0]) V1 = &(V[i, 0]) + + # Find the postion of the current ball i in the Grid, make sure to make unsigned int! + pos_grid_x = (XY1[0] / grid_spacing) + pos_grid_y = (XY1[1] / grid_spacing) + ############################################################# # IMPORTANT: do not collide two balls twice. ############################################################ + # SUBPROBLEM 2: use the grid values to reduce the number of other # objects to check for collisions. 
- for j in range(i + 1, count): - XY2 = &(XY[j, 0]) - V2 = &(V[j, 0]) - if overlapping(XY1, XY2, R): - # SUBPROBLEM 4: Add locking - if not moving_apart(XY1, V1, XY2, V2): - collide(XY1, V1, XY2, V2) - - # give a slight impulse to help separate them - for dim in range(2): - V2[dim] += eps * (XY2[dim] - XY1[dim]) + + # Ensure that the grid positions around the ball are not out of the grid + for x in range(max(pos_grid_x-3, 0), min(grid_size, pos_grid_x+3+1)): + for y in range(max(pos_grid_y-3, 0), min(grid_size, pos_grid_y+3+1)): + + # Grab the next ball + j = Grid[x, y] + + # Ensure that the grid space is not empty and + # that the order ball has not been previously compared + if j != neg_one and j>i: + + XY2 = &(XY[j, 0]) + V2 = &(V[j, 0]) + + if overlapping(XY1, XY2, R): + # SUBPROBLEM 4: Add locking + + # Acquire lock here to ensure no deadlocking + acquire(&(locks[j])) + if not moving_apart(XY1, V1, XY2, V2): + collide(XY1, V1, XY2, V2) + + # give a slight impulse to help separate them + for dim in range(2): + V2[dim] += eps * (XY2[dim] - XY1[dim]) + + # Release the comparsion ball lock + release(&(locks[j])) + + # Once we have compared all balls, release the i ball lock + release(&(locks[i])) cpdef update(FLOAT[:, ::1] XY, FLOAT[:, ::1] V, UINT[:, ::1] Grid, float R, - float grid_spacing, uintptr_t locks_ptr, - float t): + float t, + float grid_spacing, + int grid_size): + cdef: int count = XY.shape[0] int i, j, dim FLOAT *XY1, *XY2, *V1, *V2 + # SUBPROBLEM 4: uncomment this code. - # omp_lock_t *locks = locks_ptr - + omp_lock_t *locks = locks_ptr + + int new_grid_x, new_grid_y, old_grid_x, old_grid_y + int grid_pos_x, grid_pos_y + assert XY.shape[0] == V.shape[0] assert XY.shape[1] == V.shape[1] == 2 @@ -103,7 +149,8 @@ cpdef update(FLOAT[:, ::1] XY, # # SUBPROBLEM 1: parallelize this loop over 4 threads, with static # scheduling. - for i in range(count): + # for i in range(count): + for i in prange(count, schedule='static', chunksize=count/4, num_threads=4): for dim in range(2): if (((XY[i, dim] < R) and (V[i, dim] < 0)) or ((XY[i, dim] > 1.0 - R) and (V[i, dim] > 0))): @@ -113,18 +160,36 @@ cpdef update(FLOAT[:, ::1] XY, # # SUBPROBLEM 1: parallelize this loop over 4 threads, with static # scheduling. - for i in range(count): - sub_update(XY, V, R, i, count, Grid, grid_spacing) + # for i in range(count): + for i in prange(count, schedule='static', chunksize=count/4, num_threads=4): + sub_update(XY, V, R, i, count, Grid, grid_spacing, grid_size, locks) # update positions # # SUBPROBLEM 1: parallelize this loop over 4 threads (with static # scheduling). # SUBPROBLEM 2: update the grid values. 
- for i in range(count): + + # Reinitialize the grid with no balls + Grid[:,:] = -1 + + # for i in range(count): + for i in prange(count, schedule='static', chunksize=count/4, num_threads=4): + for dim in range(2): XY[i, dim] += V[i, dim] * t - + + # After updating, check to see if any positions are greater than [0,1] + if XY[i, dim] < 0: + XY[i, dim] = 0 + if XY[i,0] > 1: + XY[i, dim] = 1 + + # Update the ball positions in the Grid + grid_pos_x = (XY[i, 0]/grid_spacing) + grid_pos_y = (XY[i, 1]/grid_spacing) + Grid[grid_pos_x , grid_pos_y] = i + def preallocate_locks(num_locks): cdef omp_lock_t *locks = get_N_locks(num_locks) diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..3ed88dca 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -9,11 +9,38 @@ mandelbrot(__global __read_only float *coords_real, const int y = get_global_id(1); float c_real, c_imag; - float z_real, z_imag; + float z_real, z_imag, z_r_tmp; + float mag; int iter; - if ((x < w) && (y < h)) { + if ((x < w) && (y < h)) + { // YOUR CODE HERE - ; + c_real = coords_real[y * w + x]; + c_imag = coords_imag[y * w + x]; + + // Initialize z = c + z_real = c_real; + z_imag = c_imag; + + // Initialize iter and mag + iter = 0; + mag = (z_real * z_real) + (z_imag * z_imag); + + // With no sqrt, compare against 4 + while (mag < 4 && (iter < max_iter)) { + z_r_tmp = (z_real * z_real) - (z_imag * z_imag) + c_real; + z_imag = (2 * z_real * z_imag) + c_imag; + z_real = z_r_tmp; + + // Update mag and iter + mag = (z_real*z_real) + (z_imag * z_imag); + iter ++; + } + + // Write out to memeory + out_counts[y * w + x] = iter; + } } + diff --git a/HW3/P3/HW3_P3_results.txt b/HW3/P3/HW3_P3_results.txt new file mode 100644 index 00000000..fa9c90d6 --- /dev/null +++ b/HW3/P3/HW3_P3_results.txt @@ -0,0 +1,84 @@ +coalesced reads, workgroups: 8, num_workers: 4, 0.15895688 seconds +coalesced reads, workgroups: 8, num_workers: 8, 0.09446624 seconds +coalesced reads, workgroups: 8, num_workers: 16, 0.05449312 seconds +coalesced reads, workgroups: 8, num_workers: 32, 0.02752128 seconds +coalesced reads, workgroups: 8, num_workers: 64, 0.01398152 seconds +coalesced reads, workgroups: 8, num_workers: 128, 0.00716592 seconds +coalesced reads, workgroups: 16, num_workers: 4, 0.09116224 seconds +coalesced reads, workgroups: 16, num_workers: 8, 0.05327176 seconds +coalesced reads, workgroups: 16, num_workers: 16, 0.02685536 seconds +coalesced reads, workgroups: 16, num_workers: 32, 0.01356072 seconds +coalesced reads, workgroups: 16, num_workers: 64, 0.00713056 seconds +coalesced reads, workgroups: 16, num_workers: 128, 0.00408592 seconds +coalesced reads, workgroups: 32, num_workers: 4, 0.05368072 seconds +coalesced reads, workgroups: 32, num_workers: 8, 0.0276688 seconds +coalesced reads, workgroups: 32, num_workers: 16, 0.013952 seconds +coalesced reads, workgroups: 32, num_workers: 32, 0.0071984 seconds +coalesced reads, workgroups: 32, num_workers: 64, 0.00409112 seconds +coalesced reads, workgroups: 32, num_workers: 128, 0.00351304 seconds +coalesced reads, workgroups: 64, num_workers: 4, 0.02708528 seconds +coalesced reads, workgroups: 64, num_workers: 8, 0.01395608 seconds +coalesced reads, workgroups: 64, num_workers: 16, 0.007136 seconds +coalesced reads, workgroups: 64, num_workers: 32, 0.00413312 seconds +coalesced reads, workgroups: 64, num_workers: 64, 0.00329616 seconds +coalesced reads, workgroups: 64, num_workers: 128, 0.00326704 seconds +coalesced reads, workgroups: 128, num_workers: 4, 0.02718944 seconds 
+coalesced reads, workgroups: 128, num_workers: 8, 0.0141044 seconds +coalesced reads, workgroups: 128, num_workers: 16, 0.00727712 seconds +coalesced reads, workgroups: 128, num_workers: 32, 0.0040504 seconds +coalesced reads, workgroups: 128, num_workers: 64, 0.00331768 seconds +coalesced reads, workgroups: 128, num_workers: 128, 0.00316304 seconds +coalesced reads, workgroups: 256, num_workers: 4, 0.02838392 seconds +coalesced reads, workgroups: 256, num_workers: 8, 0.0142284 seconds +coalesced reads, workgroups: 256, num_workers: 16, 0.00733648 seconds +coalesced reads, workgroups: 256, num_workers: 32, 0.0040524 seconds +coalesced reads, workgroups: 256, num_workers: 64, 0.0032536 seconds +coalesced reads, workgroups: 256, num_workers: 128, 0.00317048 seconds +coalesced reads, workgroups: 512, num_workers: 4, 0.02871936 seconds +coalesced reads, workgroups: 512, num_workers: 8, 0.01440816 seconds +coalesced reads, workgroups: 512, num_workers: 16, 0.00772872 seconds +coalesced reads, workgroups: 512, num_workers: 32, 0.00432824 seconds +coalesced reads, workgroups: 512, num_workers: 64, 0.00321256 seconds +coalesced reads, workgroups: 512, num_workers: 128, 0.00320032 seconds +blocked reads, workgroups: 8, num_workers: 4, 0.18871568 seconds +blocked reads, workgroups: 8, num_workers: 8, 0.09651504 seconds +blocked reads, workgroups: 8, num_workers: 16, 0.05294464 seconds +blocked reads, workgroups: 8, num_workers: 32, 0.025284 seconds +blocked reads, workgroups: 8, num_workers: 64, 0.01244576 seconds +blocked reads, workgroups: 8, num_workers: 128, 0.00875008 seconds +blocked reads, workgroups: 16, num_workers: 4, 0.10793112 seconds +blocked reads, workgroups: 16, num_workers: 8, 0.06889168 seconds +blocked reads, workgroups: 16, num_workers: 16, 0.04178152 seconds +blocked reads, workgroups: 16, num_workers: 32, 0.0179508 seconds +blocked reads, workgroups: 16, num_workers: 64, 0.012344 seconds +blocked reads, workgroups: 16, num_workers: 128, 0.03174904 seconds +blocked reads, workgroups: 32, num_workers: 4, 0.0625028 seconds +blocked reads, workgroups: 32, num_workers: 8, 0.03751328 seconds +blocked reads, workgroups: 32, num_workers: 16, 0.0203404 seconds +blocked reads, workgroups: 32, num_workers: 32, 0.01248976 seconds +blocked reads, workgroups: 32, num_workers: 64, 0.0322088 seconds +blocked reads, workgroups: 32, num_workers: 128, 0.07598152 seconds +blocked reads, workgroups: 64, num_workers: 4, 0.0344596 seconds +blocked reads, workgroups: 64, num_workers: 8, 0.01866488 seconds +blocked reads, workgroups: 64, num_workers: 16, 0.01246112 seconds +blocked reads, workgroups: 64, num_workers: 32, 0.0332296 seconds +blocked reads, workgroups: 64, num_workers: 64, 0.0799416 seconds +blocked reads, workgroups: 64, num_workers: 128, 0.08079288 seconds +blocked reads, workgroups: 128, num_workers: 4, 0.03448208 seconds +blocked reads, workgroups: 128, num_workers: 8, 0.01981064 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.01336208 seconds +blocked reads, workgroups: 128, num_workers: 32, 0.03436896 seconds +blocked reads, workgroups: 128, num_workers: 64, 0.08290584 seconds +blocked reads, workgroups: 128, num_workers: 128, 0.06716448 seconds +blocked reads, workgroups: 256, num_workers: 4, 0.03447048 seconds +blocked reads, workgroups: 256, num_workers: 8, 0.01991312 seconds +blocked reads, workgroups: 256, num_workers: 16, 0.01241752 seconds +blocked reads, workgroups: 256, num_workers: 32, 0.03467448 seconds +blocked reads, workgroups: 256, num_workers: 64, 
0.06882584 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.04615288 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.03002832 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.02017728 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.01344304 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.03751952 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.04565488 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.0365916 seconds
\ No newline at end of file
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..a502cdeb
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,27 @@
+P3.txt
+
+The platforms detected are:
+---------------------------
+Apple Apple version: OpenCL 1.2 (May 10 2015 19:38:45)
+The devices detected on platform Apple are:
+---------------------------
+Intel(R) Core(TM) i5-4260U CPU @ 1.40GHz [Type: CPU ]
+Maximum clock Frequency: 1400 MHz
+Maximum allocable memory size: 2147 MB
+Maximum work group size 1024
+Maximum work item dimensions 3
+Maximum work item size [1024, 1, 1]
+---------------------------
+HD Graphics 5000 [Type: GPU ]
+Maximum clock Frequency: 1000 MHz
+Maximum allocable memory size: 402 MB
+Maximum work group size 512
+Maximum work item dimensions 3
+Maximum work item size [512, 512, 512]
+---------------------------
+
+configuration ('coalesced', 128, 128): 0.00316304 seconds
+
+Based on my configuration (MacBook Air, ~2014), the coalesced read-and-sum with 128 work groups and 128 workers came out as the winner. From the coalesced plot that I created for the output data, it is easy to see that as the number of work groups increases, the execution time of the sums decreases. This indicates that having more work groups helps performance when reading from memory in a coalesced manner.
+
+The blocked sum plot tells a different story. There, the optimal number of work groups is 8, and performance improves as the number of workers increases. However, once we increase both the number of work groups and the number of workers, performance actually decreases. This may be because each thread's block of indices, whose length is derived from the global size, increasingly runs past the bounds check, so less useful work is done per iteration and performance drops.
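+
+Illustrative sketch (my addition): the difference between the two kernels is the index pattern each work-item uses. In the coalesced kernel, work-item i reads x[i], x[i + global_size], ..., so adjacent work-items touch adjacent addresses on every pass and the reads can be combined into wide memory transactions; in the blocked kernel each work-item sums its own contiguous slice of length L = ceil(N / global_size), so adjacent work-items read addresses L apart. A plain-Python view of the indices (function names are mine):
+
+    import math
+
+    def coalesced_indices(i, global_size, N):
+        # Work-item i touches x[i], x[i + global_size], x[i + 2*global_size], ...
+        return list(range(i, N, global_size))
+
+    def blocked_indices(i, global_size, N):
+        # Work-item i touches its own contiguous block x[i*L .. (i+1)*L - 1].
+        L = math.ceil(N / global_size)
+        return list(range(i * L, min((i + 1) * L, N)))
+
+For example, with global_size = 4 and N = 16, work-items 0..3 read {0,4,8,12}, {1,5,9,13}, ... in the coalesced case but {0..3}, {4..7}, ... in the blocked case, which is why the coalesced version scales better as the work-group count grows.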
\ No newline at end of file diff --git a/HW3/P3/P3_plot.py b/HW3/P3/P3_plot.py new file mode 100644 index 00000000..7ed6db33 --- /dev/null +++ b/HW3/P3/P3_plot.py @@ -0,0 +1,42 @@ +# P3_plot.py +# Plot Output +import numpy as np +import matplotlib.pyplot as plt + +if __name__ == '__main__': + # Read in data + results = np.loadtxt('HW3_P3_results2.txt', delimiter=',', dtype=str) + + # Split data + execution_time = [x[-1].split()[0] for x in results] + workgroups = [x[1].split()[1] for x in results] + num_workers = [x[2].split()[1] for x in results] + + ### Plot coalesced ### + plt.figure(figsize=(10,8)) + + for x in xrange(6, 43, 6): + plt.plot(num_workers[x-6:x], execution_time[x-6:x], label='Work Groups: %2s'%(workgroups[x-1])) + plt.scatter(num_workers[x-6:x], execution_time[x-6:x]) + + plt.legend() + plt.title("Coalesced Sums Execution Time") + plt.ylabel("Execution Time (seconds)") + plt.xlabel("Number of Workers") + plt.ylim(-.01, .18) + plt.xlim(-2,132) + plt.show() + + ### Plot Block ### + plt.figure(figsize=(10,8)) + for x in xrange(int(84/2)+6, 85, 6): + plt.plot(num_workers[x-6:x], execution_time[x-6:x], label='Work Groups: %2s'%(workgroups[x-1])) + plt.scatter(num_workers[x-6:x], execution_time[x-6:x]) + + plt.legend() + plt.title("Blocked Reads Execution Time") + plt.ylabel("Execution Time (seconds)") + plt.xlabel("Number of Workers") + plt.ylim(-.02,.2) + plt.xlim(-2,132) + plt.show() \ No newline at end of file diff --git a/HW3/P3/blocked.png b/HW3/P3/blocked.png new file mode 100644 index 00000000..a64a6364 Binary files /dev/null and b/HW3/P3/blocked.png differ diff --git a/HW3/P3/coalesced.png b/HW3/P3/coalesced.png new file mode 100644 index 00000000..7039f9a6 Binary files /dev/null and b/HW3/P3/coalesced.png differ diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..436a2541 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -3,13 +3,24 @@ __kernel void sum_coalesced(__global float* x, __local float* fast, long N) { - float sum = 0; - size_t local_id = get_local_id(0); + + // Get global id and size + size_t i = get_global_id(0); + size_t global_size = get_global_size(0); + // Get local id and size + size_t local_id = get_local_id(0); + size_t local_size = get_local_size(0); + + // Initialize sum + float sum = 0; + // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint k = 0; i + k*global_size < N-1; k++) { + + // Sum x[i] + x[i + stride] + x[i + 2*stride] + ... + sum += x[i + k*global_size]; } fast[local_id] = sum; @@ -24,8 +35,13 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint stride=local_size/2; stride>0; stride >>= 1) { + // If the local_id is smaller than the stride then continue + if (local_id < stride) + fast[local_id] += fast[local_id + stride]; + + // Add barrier to ensure that all adds are complete + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -36,9 +52,17 @@ __kernel void sum_blocked(__global float* x, __local float* fast, long N) { - float sum = 0; + // Initialize get_x variables size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + uint local_size = get_local_size(0); + + // Get global ID + uint i = get_global_id(0); + int global_size = get_global_size(0); + + // Initialize L = ceiling(N/global_size) and sum + int L = ceil((float)N / global_size); + float sum = 0; // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +72,10 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int k=i*L; k <= (i+1)*L-1; k++) { + // Ensure that k stays in bounds relative to N + if (k < N) + sum += x[k]; } fast[local_id] = sum; @@ -61,11 +87,14 @@ __kernel void sum_blocked(__global float* x, // in fast[i], for offset = (local_size >> j) for j from 1 to // log_2(local_size) // + // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int stride=local_size/2; stride>0; stride >>= 1) { + if (local_id < stride) + fast[local_id] += fast[local_id + stride]; + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..53ab27a6 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -12,7 +12,26 @@ median_3x3(__global __read_only float *in_values, // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. 
+ + // Initialize the Local Position + // x = rows, y = cols + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // Now the Global Position + const int gx = get_global_id(0); + const int gy = get_global_id(1); + // Local buffer top left pixel reference + const int buf_corner_x = gx - lx - halo; + const int buf_corner_y = gy - ly - halo; + + // Local buffer location of current pixel + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; // Load into buffer (with 1-pixel halo). // @@ -21,14 +40,50 @@ median_3x3(__global __read_only float *in_values, // // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + + // Stay within image boundary + if (idx_1D < buf_w){ + + // Loop through rows + for (int row=0; row < buf_h; row++){ + + // Save global reference based on buffer location + int x_ref = buf_corner_x + idx_1D; + int y_ref = buf_corner_y + row; + + // Load buffer based on global location + // If y/x < 0, take 0 and if > h/w take h/w + // Kevin Chen helped me visual the min, max function + buffer[row * buf_w + idx_1D] = in_values[ min( max(0, y_ref), (h-halo) ) * w \ + + min( max(0, x_ref), (w-halo) ) ]; + } + } + + // Ensure all buffer loads are complete + barrier(CLK_LOCAL_MEM_FENCE); // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. + float filter_result; + // Ensure that global id stays within w, h + if ( (gx