diff --git a/HW2/P2/P2.py b/HW2/P2/P2.py index 697bcd0a..701d2fc0 100644 --- a/HW2/P2/P2.py +++ b/HW2/P2/P2.py @@ -10,6 +10,7 @@ import numpy as np from timer import Timer from parallel_vector import move_data_serial, move_data_fine_grained, move_data_medium_grained +import matplotlib.pyplot as plt if __name__ == '__main__': ######################################## @@ -21,6 +22,7 @@ total = orig_counts.sum() + # serial move counts = orig_counts.copy() with Timer() as t: @@ -29,23 +31,66 @@ print("Serial uncorrelated: {} seconds".format(t.interval)) serial_counts = counts.copy() - # fine grained - counts[:] = orig_counts - with Timer() as t: - move_data_fine_grained(counts, src, dest, 100) - assert counts.sum() == total, "Wrong total after move_data_fine_grained" - print("Fine grained uncorrelated: {} seconds".format(t.interval)) + ### fine grained, 4 threads, uncorrelated, multilock #### + # counts[:] = orig_counts + # with Timer() as t: + # move_data_fine_grained(counts, src, dest, 100, 4) + # assert counts.sum() == total, "Wrong total after move_data_fine_grained" + # print("Fine grained uncorrelated: {} seconds".format(t.interval)) + # for t in threads: + + ### fine grained, multithread, uncorrelated ### + # Save output for multiple Threads + threads = range(1,10) + time_fine_threads_uncor = [] + + for th in threads: + counts[:] = orig_counts + with Timer() as t: + move_data_fine_grained(counts, src, dest, 100, th) + assert counts.sum() == total, "Wrong total after move_data_fine_grained" + print("Number of Threads: {}".format(th)) + print("Fine grained uncorrelated: {} seconds".format(t.interval)) + time_fine_threads_uncor.append(t.interval) ######################################## # You should explore different values for the number of locks in the medium # grained locking ######################################## - N = 10 - counts[:] = orig_counts - with Timer() as t: - move_data_medium_grained(counts, src, dest, 100, N) - assert counts.sum() == total, "Wrong total after move_data_medium_grained" - print("Medium grained uncorrelated: {} seconds".format(t.interval)) + + ### Medium grained, 4 threads, uncorrelated multi-lock ### + # Create a range of N values for Part 1 + N_buffer = np.arange(1,21) + + # Create a few N values + N_pts = [1,2,4,5,10] + + # Save Medium grain time results + output_time_mg_uncor = [] + output_time_mg_uncor_pts = [] + + # for N in N_buffer: + # counts[:] = orig_counts + # with Timer() as t: + # move_data_medium_grained(counts, src, dest, 100, N, 4) + # assert counts.sum() == total, "Wrong total after move_data_medium_grained" + # if (N == N_pts).any(): + # print("Number of Locks: {}".format(N)) + # print("Medium grained uncorrelated: {} seconds".format(t.interval)) + # output_time_mg_uncor_pts.append(t.interval) + # output_time_mg_uncor.append(t.interval) + + ### Medium Grain, Multithread, Uncorrelated ### + time_med_threads_uncor = [] + for th in threads: + N=10 + counts[:] = orig_counts + with Timer() as t: + move_data_medium_grained(counts, src, dest, 100, N, th) + assert counts.sum() == total, "Wrong total after move_data_medium_grained" + print("Number of Threads: {}".format(th)) + print("Medium grained uncorrelated: {} seconds".format(t.interval)) + time_med_threads_uncor.append(t.interval) ######################################## # Now use correlated data movement @@ -62,21 +107,85 @@ assert counts.sum() == total, "Wrong total after move_data_serial" print("Serial correlated: {} seconds".format(t.interval)) serial_counts = counts.copy() + + ### fine 
grained, 4 threads, correlated, multilock #### + # counts[:] = orig_counts + # with Timer() as t: + # move_data_fine_grained(counts, src, dest, 100, 4) + # assert counts.sum() == total, "Wrong total after move_data_fine_grained" + # print("Fine grained correlated: {} seconds".format(t.interval)) - # fine grained - counts[:] = orig_counts - with Timer() as t: - move_data_fine_grained(counts, src, dest, 100) - assert counts.sum() == total, "Wrong total after move_data_fine_grained" - print("Fine grained correlated: {} seconds".format(t.interval)) - + ### fine grained, multithread, correlated ### + time_fine_threads_cor = [] + for th in threads: + counts[:] = orig_counts + with Timer() as t: + move_data_fine_grained(counts, src, dest, 100, th) + assert counts.sum() == total, "Wrong total after move_data_fine_grained" + print("Number of Locks: {}".format(th)) + print("Fine grained uncorrelated: {} seconds".format(t.interval)) + time_fine_threads_cor.append(t.interval) + ######################################## # You should explore different values for the number of locks in the medium # grained locking ######################################## - N = 10 - counts[:] = orig_counts - with Timer() as t: - move_data_medium_grained(counts, src, dest, 100, N) - assert counts.sum() == total, "Wrong total after move_data_medium_grained" - print("Medium grained correlated: {} seconds".format(t.interval)) + + ### medium grained, 4 threads, correlated, multilock #### + # output_time_mg_cor = [] + # output_time_mg_cor_pts = [] + # for N in N_buffer: + # counts[:] = orig_counts + # with Timer() as t: + # move_data_medium_grained(counts, src, dest, 100, N, 4) + # assert counts.sum() == total, "Wrong total after move_data_medium_grained" + # if (N == N_pts).any(): + # print("Number of Locks: {}".format(N)) + # print("Medium grained correlated: {} seconds".format(t.interval)) + # output_time_mg_cor_pts.append(t.interval) + # output_time_mg_cor.append(t.interval) + + + ### Medium Grain, Multithread, Correlated ### + time_med_threads_cor = [] + N=10 + for th in threads: + counts[:] = orig_counts + with Timer() as t: + move_data_medium_grained(counts, src, dest, 100, N, th) + assert counts.sum() == total, "Wrong total after move_data_medium_grained" + print("Number of Threads: {}".format(th)) + print("Medium grained uncorrelated: {} seconds".format(t.interval)) + time_med_threads_cor.append(t.interval) + + + ### Multi-N ploting #### + # plt.figure(figsize=(10,8)) + # plt.plot(N_buffer, output_time_mg_uncor) + # plt.plot(N_buffer, output_time_mg_cor) + # plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'UnCorrelated') + # plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated') + # plt.title("Time of Array Suffle Correlated vs. Uncorrelated (Medium Grain Locking)") + # plt.xlabel("N") + # plt.ylabel("Completation Time") + # plt.legend(loc=2) + # plt.show() + + ### Multithread ploting #### + plt.figure(figsize=(10,8)) + + # Uncorrelated + plt.plot(threads, time_fine_threads_uncor, label=u'Fine Uncorrelated') + plt.plot(threads, time_med_threads_uncor, label=u'Medium Uncorrelated') + + # Correlated + plt.plot(threads, time_fine_threads_cor, label=u'Fine Correlated') + plt.plot(threads, time_med_threads_cor, label=u'Medium Correlated') + + # plt.scatter(N_pts, output_time_mg_uncor_pts, s=50, c='Red', label=u'UnCorrelated') + # plt.scatter(N_pts, output_time_mg_cor_pts, s=50, c='Green', label=u'Correlated') + plt.title("Time of Multithreaded Array Suffle Correlated vs. 
Uncorrelated") + plt.xlabel("Threads") + plt.ylabel("Completation Time") + plt.legend(loc=2) + plt.show() diff --git a/HW2/P2/P2.txt b/HW2/P2/P2.txt new file mode 100644 index 00000000..3d257b37 --- /dev/null +++ b/HW2/P2/P2.txt @@ -0,0 +1,42 @@ +### Part A ### + +# Uncorrelated +Serial uncorrelated: 0.327378034592 seconds +Fine grained uncorrelated: 8.42167806625 seconds + +Number of Locks: 1 +Medium grained uncorrelated: 9.59998893738 seconds +Number of Locks: 2 +Medium grained uncorrelated: 10.9623169899 seconds +Number of Locks: 4 +Medium grained uncorrelated: 11.0328228474 seconds +Number of Locks: 5 +Medium grained uncorrelated: 11.3029088974 seconds +Number of Locks: 10 +Medium grained uncorrelated: 9.50029206276 seconds + + +# Correlated +Serial correlated: 0.472969055176 seconds +Fine grained correlated: 7.73167490959 seconds + +Number of Locks: 1 +Medium grained correlated: 9.56373286247 seconds +Number of Locks: 2 +Medium grained correlated: 8.43163609505 seconds +Number of Locks: 4 +Medium grained correlated: 8.61915707588 seconds +Number of Locks: 5 +Medium grained correlated: 8.97014594078 seconds +Number of Locks: 10 +Medium grained correlated: 11.4742889404 seconds + +For this part of the problem, we fixed the number of threads to 4 and changed the amount of locks available to the medium grained data move. We can see from the plotted results that there is not much difference in execution time for the uncorrelated data move, i.e. the amount of data we lock for each thread does not change the performance of the data move. For the correlated data, we see that additional locks increase the performance. This makes sense as we increase the number of locks, there is additional data for threads to operate. The ideal amount of locks should be N=20 as this would ensure that the source and destination are no more than 10 elements away from one another in either direction. However, my results tell a different story. There is a bad performance "spikes" at N = 10 and 20. Based on the results I would pick N between 10 and 20. + +### Part B ### + +Serial uncorrelated: 0.36355805397 seconds +Serial correlated: 0.428131818771 seconds + +In part B we were required to fix the number of locks to 10 and see how performance varies as we enumerate the amount of the threads available to move data. From the plot we can see that there is a point of diminishing returns after 4 threads. The performance is improved by 4 threads but decreases very quickly as we increase the number of threads. Moreover, the threaded results are much worse than the serial results, this is due to the communication needed between threads to move the data. 
+ diff --git a/HW2/P2/P2_multilock.png b/HW2/P2/P2_multilock.png new file mode 100644 index 00000000..1fb22b0e Binary files /dev/null and b/HW2/P2/P2_multilock.png differ diff --git a/HW2/P2/P2_mutlithread.png b/HW2/P2/P2_mutlithread.png new file mode 100644 index 00000000..3f13084c Binary files /dev/null and b/HW2/P2/P2_mutlithread.png differ diff --git a/HW2/P2/parallel_vector.pyx b/HW2/P2/parallel_vector.pyx index 5d67797f..900f2041 100644 --- a/HW2/P2/parallel_vector.pyx +++ b/HW2/P2/parallel_vector.pyx @@ -1,12 +1,10 @@ # turn off bounds checking & wraparound for arrays -#cython: boundscheck=False, wraparound=False +# cython: boundscheck=False, wraparound=False ################################################## # setup and helper code ################################################## - - -from cython.parallel import parallel, prange +from cython.parallel import parallel, prange, threadid from openmp cimport omp_lock_t, \ omp_init_lock, omp_destroy_lock, \ omp_set_lock, omp_unset_lock, omp_get_thread_num @@ -15,7 +13,6 @@ from libc.stdlib cimport malloc, free import numpy as np cimport numpy as np - # lock helper functions cdef void acquire(omp_lock_t *l) nogil: omp_set_lock(l) @@ -45,11 +42,71 @@ cdef void free_N_locks(int N, omp_lock_t *locks) nogil: free( locks) +# My function for checking lock conditions for fine grain +cdef void check_lock(int idx, omp_lock_t *locks, np.int32_t[:] counts, + np.int32_t[:] src, np.int32_t[:] dest) nogil: + + # If the source index is greater, grab it's lock first + # This prevents deadlocking + if src[idx] > dest[idx]: + acquire(&locks[src[idx]]) + acquire(&locks[dest[idx]]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]]) + release(&locks[dest[idx]]) + + # If the destination's index is greater, grab it first + # Also prevents deadlocking + elif src[idx] < dest[idx]: + acquire(&locks[dest[idx]]) + acquire(&locks[src[idx]]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]]) + release(&locks[dest[idx]]) + + # If the indexs are equal, only grab one lock + # This prevents double locking + else: + acquire(&locks[src[idx]]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]]) + + +# My Function Updated for course grain locking +cdef void check_lock_med(int idx, int N, omp_lock_t *locks, np.int32_t[:] counts, + np.int32_t[:] src, np.int32_t[:] dest) nogil: + + # We must now check for the index/N as that cooresponds to the + # number of locks that are available + if src[idx]/N > dest[idx]/N: + acquire(&locks[src[idx]/N]) + acquire(&locks[dest[idx]/N]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]/N]) + release(&locks[dest[idx]/N]) + + elif src[idx]/N < dest[idx]/N: + acquire(&locks[dest[idx]/N]) + acquire(&locks[src[idx]/N]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]/N]) + release(&locks[dest[idx]/N]) + + else: + acquire(&locks[src[idx]/N]) + counts[dest[idx]] += 1 + counts[src[idx]] -= 1 + release(&locks[src[idx]/N]) + ################################################## # Your code below ################################################## - cpdef move_data_serial(np.int32_t[:] counts, np.int32_t[:] src, np.int32_t[:] dest, @@ -65,11 +122,12 @@ cpdef move_data_serial(np.int32_t[:] counts, counts[dest[idx]] += 1 counts[src[idx]] -= 1 - +# Updated move_data_fine_grained for parallel implementation cpdef move_data_fine_grained(np.int32_t[:] counts, np.int32_t[:] src, np.int32_t[:] dest, - int repeat): + int repeat, + int 
threads): cdef: int idx, r omp_lock_t *locks = get_N_locks(counts.shape[0]) @@ -79,12 +137,11 @@ cpdef move_data_fine_grained(np.int32_t[:] counts, # Use parallel.prange() and a lock for each element of counts to parallelize # data movement. Be sure to avoid deadlock, and double-locking. ########## - with nogil: - for r in range(repeat): - for idx in range(src.shape[0]): - if counts[src[idx]] > 0: - counts[dest[idx]] += 1 - counts[src[idx]] -= 1 + + for r in xrange(repeat): + for idx in prange(src.shape[0], nogil=True, num_threads=threads): + if counts[src[idx]] > 0: + check_lock(idx, locks, counts, src, dest) free_N_locks(counts.shape[0], locks) @@ -93,7 +150,8 @@ cpdef move_data_medium_grained(np.int32_t[:] counts, np.int32_t[:] src, np.int32_t[:] dest, int repeat, - int N): + int N, + int threads): cdef: int idx, r int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks @@ -105,11 +163,11 @@ cpdef move_data_medium_grained(np.int32_t[:] counts, # to parallelize data movement. Be sure to avoid deadlock, as well as # double-locking. ########## - with nogil: - for r in range(repeat): - for idx in range(src.shape[0]): - if counts[src[idx]] > 0: - counts[dest[idx]] += 1 - counts[src[idx]] -= 1 - - free_N_locks(num_locks, locks) + + for r in xrange(repeat): + for idx in prange(src.shape[0], nogil=True, num_threads=threads): + if counts[src[idx]] > 0: + check_lock_med(idx, N, locks, counts, src, dest) + + free_N_locks(num_locks, locks) + diff --git a/HW2/P3/P3.txt b/HW2/P3/P3.txt new file mode 100644 index 00000000..bef331c0 --- /dev/null +++ b/HW2/P3/P3.txt @@ -0,0 +1,26 @@ +P3.txt + +### Serial Implementation ### +1074.656613 Million Complex FMAs in 6.37947392464 seconds, 168.455365708 million Complex FMAs / second + +### Multithread Implementation ### +1 Thread +1074.656613 Million Complex FMAs in 6.59998703003 seconds, 162.827079525 million Complex FMAs / second +2 Threads +1074.656613 Million Complex FMAs in 3.32439684868 seconds, 323.263636057 million Complex FMAs / second +4 Threads +1074.656613 Million Complex FMAs in 1.87254285812 seconds, 573.902278571 million Complex FMAs / second + +### AVX Implementation ### +1 Thread +1074.656613 Million Complex FMAs in 1.8737821579 seconds, 573.522705652 million Complex FMAs / second +2 Threads +1074.957423 Million Complex FMAs in 0.47437620163 seconds, 2266.04416349 million Complex FMAs / second +4 Threads +1074.957423 Million Complex FMAs in 0.386723995209 seconds, 2779.65018028 million Complex FMAs / second + +In this problem we worked with the infamous Mandelbrot data set which we have become so familiar with over the past month, and the task was to implement the calculation first using multithreading, then instruction-level parallelism. In the multithread implementation, we can observe that there is performance decrease when comparing the serial version to the 1 threaded solution. This once again highlights the communication costs of using a threaded approach. However, once the number of thread are increased, a significant speed up in the calculation is achieved. By doubling the amount of threads, the same number of complex FMAs are performed in approximately half the time. In the 4 thread implementation, the performance is improved by x3.5. This shows that adding threads to a ridiculous parallel problem does in fact increase the performance. + +In the instruction-level parallelism portion of this problem, we were asked to implement the Mandelbrot calculation using 8-way instruction-level parallelism via AVX. 
The performance boost achieved by implementing SIMD is outstanding. The 8-way parallel implementation with just one thread is almost 4 times faster than the serial version. Using 4 threads, there is a speed up of ~x16! + +AVX was quite an adventure for me, I had no experience using SIMD but I have previous programmed in assembly. I have to say that it was very difficult, but also very rewarding. It's obvious that there are real performance benefits associated with AVX, but it comes with a price, which is a complex program. I had to completely change the way I thought about the problem and really consider every implementation decision that I made. The discovery of both fmadd and fmsub turned out to be very helpful for the real and imaginary z calculations. I spent a few office hour sessions walking through this calculation and I have to thank both Kevin, Phillipe and Adi for their assistance. \ No newline at end of file diff --git a/HW2/P3/driver.py b/HW2/P3/driver.py index 791eda47..b7ab9b4c 100644 --- a/HW2/P3/driver.py +++ b/HW2/P3/driver.py @@ -35,4 +35,4 @@ def make_coords(center=(-0.575 - 0.575j), print("{} Million Complex FMAs in {} seconds, {} million Complex FMAs / second".format(out_counts.sum() / 1e6, seconds, (out_counts.sum() / seconds) / 1e6)) plt.imshow(np.log(out_counts)) - plt.show() + plt.show() \ No newline at end of file diff --git a/HW2/P3/mandelbrot.pyx b/HW2/P3/mandelbrot.pyx index a019cfa2..2629d7e2 100644 --- a/HW2/P3/mandelbrot.pyx +++ b/HW2/P3/mandelbrot.pyx @@ -5,19 +5,18 @@ import numpy cimport AVX from cython.parallel import prange - cdef np.float64_t magnitude_squared(np.complex64_t z) nogil: return z.real * z.real + z.imag * z.imag +# cdef np.float32_t avx_mag_square(np.ndarray[np.float32_t, ndim=8] avx_z): +# return AVX.sqrt(avx_z) + @cython.boundscheck(False) @cython.wraparound(False) cpdef mandelbrot(np.complex64_t [:, :] in_coords, np.uint32_t [:, :] out_counts, int max_iterations=511): - cdef: - int i, j, iter - np.complex64_t c, z - + # To declare AVX.float8 variables, use: # cdef: # AVX.float8 v1, v2, v3 @@ -31,6 +30,18 @@ cpdef mandelbrot(np.complex64_t [:, :] in_coords, assert in_coords.shape[0] == out_counts.shape[0], "Input and output arrays must be the same size" assert in_coords.shape[1] == out_counts.shape[1], "Input and output arrays must be the same size" + #out_counts = serial_mandel(in_coords, out_counts, max_iterations) + #out_counts = mandel_multi(in_coords, out_counts, 4, max_iterations) + out_counts = mandel_AVX(in_coords, out_counts, 4, max_iterations) + +cpdef serial_mandel(np.complex64_t [:, :] in_coords, + np.uint32_t [:, :] out_counts, + int max_iterations=511): + cdef: + int i, j, iter + np.complex64_t c, z + + # Serial Code with nogil: for i in range(in_coords.shape[0]): for j in range(in_coords.shape[1]): @@ -41,17 +52,117 @@ cpdef mandelbrot(np.complex64_t [:, :] in_coords, break z = z * z + c out_counts[i, j] = iter + + return out_counts - +cpdef mandel_multi(np.complex64_t [:, :] in_coords, + np.uint32_t [:, :] out_counts, + int threads, + int max_iterations=511): + cdef: + int i, j, iter + np.complex64_t c, z + + # Completed Multithreading Code + for i in prange(in_coords.shape[0], nogil=True, schedule='static', chunksize=1, num_threads=threads): + for j in xrange(in_coords.shape[1]): + c = in_coords[i, j] + z = 0 + for iter in xrange(max_iterations): + if magnitude_squared(z) > 4: + break + z = z * z + c + out_counts[i, j] = iter + + return out_counts + +cpdef mandel_AVX(np.complex64_t [:, :] in_coords, + np.uint32_t [:, :] 
out_counts, + int threads, + int max_iterations=511): + + cdef: + AVX.float8 real_avx, imag_avx, complex_avx + AVX.float8 z_real, z_imag, z_r_tmp + AVX.float8 mag_real_avx, mag_imag_avx, mag_tot_avx + AVX.float8 cutoff_avx, two_avx, one_avx, mask_avx, compare_avx, count_avx, c_plus_one + + np.float32_t [:,:] real_np, imag_np + + int i, j, k, iter + + # Convert Complex to real and image float32 arrays + real_np = np.real(in_coords) + imag_np = np.imag(in_coords) + + # Initialize the real and imaginary AVX floats + real_avx = AVX.float_to_float8(0) + imag_avx = AVX.float_to_float8(0) + + # Initialize the magnitude vectors for real, imag, and output + mag_real_avx = AVX.float_to_float8(0) + mag_imag_avx = AVX.float_to_float8(0) + mag_tot_avx = AVX.float_to_float8(0) + + # Initialize cutoff=4, two=2, one=1, compare and mask AVX vectors + cutoff_avx = AVX.float_to_float8(4) + two_avx = AVX.float_to_float8(2) + one_avx = AVX.float_to_float8(1) + compare_avx = AVX.float_to_float8(0) + mask_avx = AVX.float_to_float8(0) + + for i in prange(in_coords.shape[0], nogil=True, schedule='static', chunksize=1, num_threads=threads): + #for i in xrange(in_coords.shape[0]): + for j in range(0, in_coords.shape[1], 8): + + # Initialize the Real and Imag parts of the AVX values + # c = in_coords[i, j] + real_avx = AVX.make_float8(real_np[i, j], real_np[i, j+1], real_np[i, j+2], real_np[i, j+3], + real_np[i, j+4], real_np[i, j+5], real_np[i, j+6], real_np[i, j+7]) + imag_avx = AVX.make_float8(imag_np[i, j], imag_np[i, j+1], imag_np[i, j+2], imag_np[i, j+3], + imag_np[i, j+4], imag_np[i, j+5], imag_np[i, j+6], imag_np[i, j+7]) + + # z = 0 + z_real = AVX.float_to_float8(0) + z_imag = AVX.float_to_float8(0) + z_r_tmp = AVX.float_to_float8(0) + count_avx = AVX.float_to_float8(0) + + for iter in range(max_iterations): + # Compute Magnitude + mag_real_avx = AVX.mul(z_real, z_real) + mag_tot_avx = AVX.fmadd(z_imag, z_imag, mag_real_avx) + + # Only modify AVX values if less than 4 + compare_avx = AVX.less_than(mag_tot_avx, cutoff_avx) + mask_avx = AVX.bitwise_and(one_avx, compare_avx) + count_avx = AVX.add(count_avx, mask_avx) + + # Break if all values are great than 4 + if not AVX.signs(compare_avx): + break + + # Update Z_real and Z_complex + z_r_tmp = AVX.fmsub(z_real, z_real, AVX.fmsub(z_imag, z_imag, real_avx)) + z_imag = AVX.fmadd(two_avx, AVX.mul(z_real, z_imag), imag_avx) + z_real = z_r_tmp + + # Update outcounts + for k in range(8): + out_counts[i,j+k] = (( &count_avx)[k]) + + return out_counts # An example using AVX instructions cpdef example_sqrt_8(np.float32_t[:] values): cdef: - AVX.float8 avxval, tmp, mask - float out_vals[8] + AVX.float8 avxval, tmp, mask, new + float out_vals[8], out_new[8] float [:] out_view = out_vals + float [:] new_view = out_new assert values.shape[0] == 8 + new = AVX.float_to_float8(5) # Note that the order of the arguments here is opposite the direction when # we retrieve them into memory. 
@@ -73,5 +184,9 @@ cpdef example_sqrt_8(np.float32_t[:] values): avxval = AVX.bitwise_andnot(mask, avxval) AVX.to_mem(avxval, &(out_vals[0])) + AVX.to_mem(new, &(out_new[0])) + + print("Out View:", np.array(out_view)) + print("New:", np.array(new_view)) return np.array(out_view) diff --git a/HW2/P4/P4.txt b/HW2/P4/P4.txt new file mode 100644 index 00000000..8ecfae77 --- /dev/null +++ b/HW2/P4/P4.txt @@ -0,0 +1,14 @@ +I referenced the following webpage for this problem and collaborated with Patrick Kuiper: +https://pymotw.com/2/threading/ + +Original Run +5.32677793503 seconds for 10 filter passes. + +Threaded Code Runs +5.90390110016 seconds for 10 filter passes and 1 threads +3.10576891899 seconds for 10 filter passes and 2 threads +2.29144191742 seconds for 10 filter passes and 4 threads + +In this problem we discovered how to implement python threads using events objects. I used the events object call to keep track of each thread’s progress for every iteration of the image filter. Events allow for threads to keep track of one another and to sync up between iterations. I found the "set()" logic to be counter intuitive as you set the current processing thread, then alert it's neighbor threads that they must "wait()" until each other have completed their calculation. I originally thought that this syntax would be the other way around, which led to many hours of python event fun! + +Based on my results, one can see that adding more threads does speed the median filter algorithm up. However, there is an overhead associated with establishing threads as the single thread execution took longer than the no thread original test. \ No newline at end of file diff --git a/HW2/P4/driver.py b/HW2/P4/driver.py index 0051a121..a0ea23e2 100644 --- a/HW2/P4/driver.py +++ b/HW2/P4/driver.py @@ -27,6 +27,63 @@ def py_median_3x3(image, iterations=10, num_threads=1): return tmpA + +def worker(events, thread_id, thread_num, iterations, orig_imag, new_imag): + for i in range(iterations): + + # Do nothing for the first iteration, however still needed for i-1 + if i == 0: + pass + + # Do nothing if there is only one thread + elif thread_num == 1: + pass + + # Corner case first thread only has 1 dependent thread + elif thread_id == 0: + events[thread_id+1, i-1].wait() + + # Corner case last thread only has 1 dependent thread + elif thread_id == thread_num-1: + events[thread_id-1, i-1].wait() + + # All other threads have 2 dependents + # Tell both previous and next to wait on current thread + else: + events[thread_id-1, i-1].wait() + events[thread_id+1, i-1].wait() + + # Run provided cython function to filter + filtering.median_3x3(orig_imag, new_imag, thread_id, thread_num) + + # Set current thread ID so that dependent threads know to wait + events[thread_id, i].set() + + # Simultaneous change original image and new filtered image + orig_imag, new_imag = new_imag, orig_imag + +def py_median_3x3_threaded(image, iterations, num_threads): + ''' filter with a 3x3 median threading ''' + tmpA = image.copy() + tmpB = np.empty_like(tmpA) + + # Create the event array for all threads and every iteration + events = np.array([[threading.Event()]*iterations]*num_threads) + + # Create Threads given the worker function and start + for t in range(num_threads): + threads = threading.Thread(target=worker, args=(events, t, num_threads, iterations, tmpA, tmpB)) + threads.start() + + # Join Threads to ensure all threads have completed their work + threads.join() + + # If odd number of interations return empty image + if iterations % 2 
== 1: + return tmpB + else: + return tmpA + def numpy_median(image, iterations=10): ''' filter using numpy ''' for i in range(iterations): @@ -42,26 +99,47 @@ def numpy_median(image, iterations=10): if __name__ == '__main__': input_image = np.load('image.npz')['image'].astype(np.float32) - pylab.gray() + # pylab.gray() - pylab.imshow(input_image) - pylab.title('original image') + # pylab.imshow(input_image) + # pylab.title('original image') - pylab.figure() - pylab.imshow(input_image[1200:1800, 3000:3500]) - pylab.title('before - zoom') + # pylab.figure() + # pylab.imshow(input_image[1200:1800, 3000:3500]) + # pylab.title('before - zoom') # verify correctness - from_cython = py_median_3x3(input_image, 2, 5) - from_numpy = numpy_median(input_image, 2) - assert np.all(from_cython == from_numpy) + # from_cython = py_median_3x3(input_image, 2, 5) + # from_numpy = numpy_median(input_image, 2) + # assert np.all(from_cython == from_numpy) with Timer() as t: - new_image = py_median_3x3(input_image, 10, 8) + new_image = py_median_3x3(input_image, 10, 1) - pylab.figure() - pylab.imshow(new_image[1200:1800, 3000:3500]) - pylab.title('after - zoom') + # pylab.figure() + # pylab.imshow(new_image[1200:1800, 3000:3500]) + # pylab.title('after - zoom') print("{} seconds for 10 filter passes.".format(t.interval)) - pylab.show() + # pylab.show() + + ### My Code ### + from_cython = py_median_3x3_threaded(input_image, 2, 5) + from_numpy = numpy_median(input_image, 2) + assert np.all(from_cython == from_numpy), "Images don't match!" + + with Timer() as time1: + threaded_image = py_median_3x3_threaded(input_image, 10, 1) + + with Timer() as time2: + threaded_image = py_median_3x3_threaded(input_image, 10, 2) + + with Timer() as time4: + threaded_image = py_median_3x3_threaded(input_image, 10, 4) + + results = [(1, time1), (2, time2), (4, time4)] + + for t, r in results: + print("{} seconds for 10 filter passes and {} threads".format(r.interval, t)) + + diff --git a/HW2/P5/P5.txt b/HW2/P5/P5.txt new file mode 100644 index 00000000..466ccae5 --- /dev/null +++ b/HW2/P5/P5.txt @@ -0,0 +1,42 @@ +P5.txt + +### SUBPROBLEM 1: multithreading ### + +# Original Implementation +10 frame update average ~ 8.4886844904079997 simulation frames per second +# 1 Thread +10 frame update average ~ 8.9781141036150025 simulation frames per second +# 4 threads +10 frame update average ~ 13.478761244959998 simulation frames per second + +For subproblem 1, we updated the physics engine with multithreading (via a prange in update function). We can see from the average simulation frames per second that adding additional threads for computation slightly increases the performance of the program. However, these updates are not substantial and as we discovered later in the problem, there is room for improvement. + +### SUBPROBLEM 2: Spatial decomposition ### + +# 1 Thread +10 frame update average ~ 327.77711671729998 simulation frames per second +# 4 Threads +10 frame update average ~ 488.2193716649 simulation frames per second + +For subproblem 2, we see a huge increase in performance due to the spatial decomposition. The SD was achieved by finding the position of the balls in the grid and only searching in a small box around the ball of interest instead of searching all balls in space. This significantly cuts down the amount of comparisons needed to see if balls collide and there are performance benefits from doing so. 
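+
+Illustrative sketch (my addition, not the submitted physics.pyx): the spatial decomposition only compares a ball against balls registered in nearby grid cells. Since grid_spacing = radius / sqrt(2), two overlapping balls can be at most 2*sqrt(2) (about 2.8) cells apart, so scanning a +/-3 cell window around the ball is enough. The function name and window argument below are just for illustration:
+
+    import numpy as np
+
+    def nearby_candidates(grid, pos, grid_spacing, window=3):
+        # Grid cell occupied by this ball.
+        gx, gy = int(pos[0] / grid_spacing), int(pos[1] / grid_spacing)
+        size = grid.shape[0]
+        empty = np.uint32(-1)   # the grid is initialised to -1 (no ball in the cell)
+        candidates = []
+        # Scan only the (2*window + 1)^2 cells around the ball, clipped to the grid edges.
+        for x in range(max(gx - window, 0), min(size, gx + window + 1)):
+            for y in range(max(gy - window, 0), min(size, gy + window + 1)):
+                j = grid[x, y]
+                if j != empty:   # this cell holds a ball index
+                    candidates.append(int(j))
+        return candidates
+
+The returned candidate list is what replaces the all-pairs loop over every other ball, which is where the large speedup comes from.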
+ +### SUBPROBLEM 3: Spatially Coherent Sorting ### + +# 1 Thread +10 frame update average ~ 539.95193751989996 simulation frames per second +# 4 Threads +10 frame update average ~ 728.73026065850001 simulation frames per second + +In subproblem 3, we were asked to arrange the balls in to Morton/Zorder or Hilbert spacing in order for balls that are near each other in the grid to be located near each in memory. I choose to implement a morton curve as I wasn't familiar with either Morton or Hilbert and the Morton/Zorder curve seemed easier to implement. The Zorder is calculated by interleaving bits for 2 positions together (e.g. [0,1] and [1,0] would become [0110]) then sorting based on this new representation. We see a performance increase due to the fact that the balls are closer together in memory and therefore we don't have to go as far in memory for the comparison. + +I used the following website as a reference and collaborated with Patrick Kupier on this subproblem: +http://www.thejach.com/view/2011/9/playing_with_morton_numbers + +### SUBPROBLEM 4: Locks ### + +# 1 Thread +10 frame update average ~ 404.76805630230001 simulation frames per second +# 4 Threads +10 frame update average ~ 644.6591968916 simulation frames per second + +In subproblem 4, we were tasked to implement locks in order to ensure that the multiple threads weren't altering the positions of the balls. Since the threads are implementing locks in a fine grain fashion, we actually realize degraded performance. Although the performance of the program is decreased, our program actually becomes more realistic as we are not updating collisions multiple times for the same collision. In the end, our parallel implementation is approx. 90x faster than the serial implementation. Parallel coding is the way to go! \ No newline at end of file diff --git a/HW2/P5/driver.py b/HW2/P5/driver.py index fdd1a102..69727a62 100644 --- a/HW2/P5/driver.py +++ b/HW2/P5/driver.py @@ -16,9 +16,35 @@ def randcolor(): return np.random.uniform(0.0, 0.89, (3,)) + 0.1 +### Subproblem 3 ### +### Z-Order Implementation ### +# Referenced from https://en.wikipedia.org/wiki/Z-order_curve +def cmp_zorder(arr1, arr2, dim=2): + j = 0 + k = 0 + x = 0 + a = arr1[1] + b = arr2[1] + for k in range(dim): + y = a[k] ^ b[k] + if less_msb(x, y): + j = k + x = y + return a[j] - b[j] + +def less_msb(x, y): + return x < y and x < (x ^ y) + + if __name__ == '__main__': + # Real Balls num_balls = 10000 radius = 0.002 + + # Test Balls + # num_balls = 10 + # radius = 0.05 + positions = np.random.uniform(0 + radius, 1 - radius, (num_balls, 2)).astype(np.float32) @@ -41,6 +67,7 @@ def randcolor(): # Each square in the grid stores the index of the object in that square, or # -1 if no object. We don't worry about overlapping objects, and just # store one of them. + # Grid Shape (708, 708) grid_spacing = radius / np.sqrt(2.0) grid_size = int((1.0 / grid_spacing) + 1) grid = - np.ones((grid_size, grid_size), dtype=np.uint32) @@ -64,8 +91,7 @@ def randcolor(): while True: with Timer() as t: update(positions, velocities, grid, - radius, grid_size, locks_ptr, - physics_step) + radius, locks_ptr, physics_step, grid_spacing, grid_size) # udpate our estimate of how fast the simulator runs physics_step = 0.9 * physics_step + 0.1 * t.interval @@ -77,6 +103,31 @@ def randcolor(): print("{} simulation frames per second".format(frame_count / total_time)) frame_count = 0 total_time = 0 + # SUBPROBLEM 3: sort objects by location. Be sure to update the # grid if objects' indices change! 
Also be sure to sort the # velocities with their object positions! + + # Create ball grid positions based on the grid spacing + grid_pos = (positions/grid_spacing).astype(int) + + # Create ball grid positions based on the grid spacing + grid_pos = (positions/grid_spacing).astype(int) + + # Zip an original index together with positons + pos_arr = zip(xrange(num_balls), grid_pos) + + # Sort the array based on the Zorder, and get the order indexs + pos_arr.sort(cmp_zorder) + z_idx = [idx for idx, pos in pos_arr] + + # Update position and Velocity + positions = positions[z_idx]; velocities = velocities[z_idx] + + # Ensure that all balls are inside [0,1] + inside = np.array(filter(lambda x: x[0] > 0 and x[1] < 1, positions)) + num_inside = len(inside) + + # Update the inside balls + grid = - np.ones((grid_size, grid_size), dtype=np.uint32) + grid[(inside[:, 0]/grid_spacing).astype(int), (positions[:, 1]/grid_spacing).astype(int)] = np.array(range(num_inside)) \ No newline at end of file diff --git a/HW2/P5/physics.pyx b/HW2/P5/physics.pyx index dbd78e6e..f0d9cceb 100644 --- a/HW2/P5/physics.pyx +++ b/HW2/P5/physics.pyx @@ -1,15 +1,20 @@ #cython: boundscheck=False, wraparound=False + cimport numpy as np from libc.math cimport sqrt from libc.stdint cimport uintptr_t cimport cython from omp_defs cimport omp_lock_t, get_N_locks, free_N_locks, acquire, release +from cython.parallel import parallel, prange, threadid # Useful types ctypedef np.float32_t FLOAT ctypedef np.uint32_t UINT +# Unsigned Int -1 Global Variable +cdef UINT neg_one = 4294967295 + cdef inline int overlapping(FLOAT *x1, FLOAT *x2, float R) nogil: @@ -38,14 +43,18 @@ cdef inline void collide(FLOAT *x1, FLOAT *v1, float change_v1[2] float len_x1_m_x2, dot_v_x int dim + # https://en.wikipedia.org/wiki/Elastic_collision#Two-dimensional_collision_with_two_moving_objects for dim in range(2): x1_minus_x2[dim] = x1[dim] - x2[dim] v1_minus_v2[dim] = v1[dim] - v2[dim] + len_x1_m_x2 = x1_minus_x2[0] * x1_minus_x2[0] + x1_minus_x2[1] * x1_minus_x2[1] dot_v_x = v1_minus_v2[0] * x1_minus_x2[0] + v1_minus_v2[1] * x1_minus_x2[1] + for dim in range(2): change_v1[dim] = (dot_v_x / len_x1_m_x2) * x1_minus_x2[dim] + for dim in range(2): v1[dim] -= change_v1[dim] v2[dim] += change_v1[dim] # conservation of momentum @@ -55,46 +64,83 @@ cdef void sub_update(FLOAT[:, ::1] XY, float R, int i, int count, UINT[:, ::1] Grid, - float grid_spacing) nogil: + float grid_spacing, + int grid_size, + omp_lock_t *locks) nogil: cdef: FLOAT *XY1, *XY2, *V1, *V2 - int j, dim + int j, dim, x, y float eps = 1e-5 + int pos_grid_x, pos_grid_y # SUBPROBLEM 4: Add locking + acquire(&(locks[i])) XY1 = &(XY[i, 0]) V1 = &(V[i, 0]) + + # Find the postion of the current ball i in the Grid, make sure to make unsigned int! + pos_grid_x = (XY1[0] / grid_spacing) + pos_grid_y = (XY1[1] / grid_spacing) + ############################################################# # IMPORTANT: do not collide two balls twice. ############################################################ + # SUBPROBLEM 2: use the grid values to reduce the number of other # objects to check for collisions. 
- for j in range(i + 1, count): - XY2 = &(XY[j, 0]) - V2 = &(V[j, 0]) - if overlapping(XY1, XY2, R): - # SUBPROBLEM 4: Add locking - if not moving_apart(XY1, V1, XY2, V2): - collide(XY1, V1, XY2, V2) - - # give a slight impulse to help separate them - for dim in range(2): - V2[dim] += eps * (XY2[dim] - XY1[dim]) + + # Ensure that the grid positions around the ball are not out of the grid + for x in range(max(pos_grid_x-3, 0), min(grid_size, pos_grid_x+3+1)): + for y in range(max(pos_grid_y-3, 0), min(grid_size, pos_grid_y+3+1)): + + # Grab the next ball + j = Grid[x, y] + + # Ensure that the grid space is not empty and + # that the order ball has not been previously compared + if j != neg_one and j>i: + + XY2 = &(XY[j, 0]) + V2 = &(V[j, 0]) + + if overlapping(XY1, XY2, R): + # SUBPROBLEM 4: Add locking + + # Acquire lock here to ensure no deadlocking + acquire(&(locks[j])) + if not moving_apart(XY1, V1, XY2, V2): + collide(XY1, V1, XY2, V2) + + # give a slight impulse to help separate them + for dim in range(2): + V2[dim] += eps * (XY2[dim] - XY1[dim]) + + # Release the comparsion ball lock + release(&(locks[j])) + + # Once we have compared all balls, release the i ball lock + release(&(locks[i])) cpdef update(FLOAT[:, ::1] XY, FLOAT[:, ::1] V, UINT[:, ::1] Grid, float R, - float grid_spacing, uintptr_t locks_ptr, - float t): + float t, + float grid_spacing, + int grid_size): + cdef: int count = XY.shape[0] int i, j, dim FLOAT *XY1, *XY2, *V1, *V2 + # SUBPROBLEM 4: uncomment this code. - # omp_lock_t *locks = locks_ptr - + omp_lock_t *locks = locks_ptr + + int new_grid_x, new_grid_y, old_grid_x, old_grid_y + int grid_pos_x, grid_pos_y + assert XY.shape[0] == V.shape[0] assert XY.shape[1] == V.shape[1] == 2 @@ -103,7 +149,8 @@ cpdef update(FLOAT[:, ::1] XY, # # SUBPROBLEM 1: parallelize this loop over 4 threads, with static # scheduling. - for i in range(count): + # for i in range(count): + for i in prange(count, schedule='static', chunksize=count/4, num_threads=4): for dim in range(2): if (((XY[i, dim] < R) and (V[i, dim] < 0)) or ((XY[i, dim] > 1.0 - R) and (V[i, dim] > 0))): @@ -113,18 +160,36 @@ cpdef update(FLOAT[:, ::1] XY, # # SUBPROBLEM 1: parallelize this loop over 4 threads, with static # scheduling. - for i in range(count): - sub_update(XY, V, R, i, count, Grid, grid_spacing) + # for i in range(count): + for i in prange(count, schedule='static', chunksize=count/4, num_threads=4): + sub_update(XY, V, R, i, count, Grid, grid_spacing, grid_size, locks) # update positions # # SUBPROBLEM 1: parallelize this loop over 4 threads (with static # scheduling). # SUBPROBLEM 2: update the grid values. 
- for i in range(count): + + # Reinitialize the grid with no balls + Grid[:,:] = -1 + + # for i in range(count): + for i in prange(count, schedule='static', chunksize=count/4, num_threads=4): + for dim in range(2): XY[i, dim] += V[i, dim] * t - + + # After updating, check to see if any positions are greater than [0,1] + if XY[i, dim] < 0: + XY[i, dim] = 0 + if XY[i,0] > 1: + XY[i, dim] = 1 + + # Update the ball positions in the Grid + grid_pos_x = (XY[i, 0]/grid_spacing) + grid_pos_y = (XY[i, 1]/grid_spacing) + Grid[grid_pos_x , grid_pos_y] = i + def preallocate_locks(num_locks): cdef omp_lock_t *locks = get_N_locks(num_locks) diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..3ed88dca 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -9,11 +9,38 @@ mandelbrot(__global __read_only float *coords_real, const int y = get_global_id(1); float c_real, c_imag; - float z_real, z_imag; + float z_real, z_imag, z_r_tmp; + float mag; int iter; - if ((x < w) && (y < h)) { + if ((x < w) && (y < h)) + { // YOUR CODE HERE - ; + c_real = coords_real[y * w + x]; + c_imag = coords_imag[y * w + x]; + + // Initialize z = c + z_real = c_real; + z_imag = c_imag; + + // Initialize iter and mag + iter = 0; + mag = (z_real * z_real) + (z_imag * z_imag); + + // With no sqrt, compare against 4 + while (mag < 4 && (iter < max_iter)) { + z_r_tmp = (z_real * z_real) - (z_imag * z_imag) + c_real; + z_imag = (2 * z_real * z_imag) + c_imag; + z_real = z_r_tmp; + + // Update mag and iter + mag = (z_real*z_real) + (z_imag * z_imag); + iter ++; + } + + // Write out to memeory + out_counts[y * w + x] = iter; + } } + diff --git a/HW3/P3/HW3_P3_results.txt b/HW3/P3/HW3_P3_results.txt new file mode 100644 index 00000000..fa9c90d6 --- /dev/null +++ b/HW3/P3/HW3_P3_results.txt @@ -0,0 +1,84 @@ +coalesced reads, workgroups: 8, num_workers: 4, 0.15895688 seconds +coalesced reads, workgroups: 8, num_workers: 8, 0.09446624 seconds +coalesced reads, workgroups: 8, num_workers: 16, 0.05449312 seconds +coalesced reads, workgroups: 8, num_workers: 32, 0.02752128 seconds +coalesced reads, workgroups: 8, num_workers: 64, 0.01398152 seconds +coalesced reads, workgroups: 8, num_workers: 128, 0.00716592 seconds +coalesced reads, workgroups: 16, num_workers: 4, 0.09116224 seconds +coalesced reads, workgroups: 16, num_workers: 8, 0.05327176 seconds +coalesced reads, workgroups: 16, num_workers: 16, 0.02685536 seconds +coalesced reads, workgroups: 16, num_workers: 32, 0.01356072 seconds +coalesced reads, workgroups: 16, num_workers: 64, 0.00713056 seconds +coalesced reads, workgroups: 16, num_workers: 128, 0.00408592 seconds +coalesced reads, workgroups: 32, num_workers: 4, 0.05368072 seconds +coalesced reads, workgroups: 32, num_workers: 8, 0.0276688 seconds +coalesced reads, workgroups: 32, num_workers: 16, 0.013952 seconds +coalesced reads, workgroups: 32, num_workers: 32, 0.0071984 seconds +coalesced reads, workgroups: 32, num_workers: 64, 0.00409112 seconds +coalesced reads, workgroups: 32, num_workers: 128, 0.00351304 seconds +coalesced reads, workgroups: 64, num_workers: 4, 0.02708528 seconds +coalesced reads, workgroups: 64, num_workers: 8, 0.01395608 seconds +coalesced reads, workgroups: 64, num_workers: 16, 0.007136 seconds +coalesced reads, workgroups: 64, num_workers: 32, 0.00413312 seconds +coalesced reads, workgroups: 64, num_workers: 64, 0.00329616 seconds +coalesced reads, workgroups: 64, num_workers: 128, 0.00326704 seconds +coalesced reads, workgroups: 128, num_workers: 4, 0.02718944 seconds 
+coalesced reads, workgroups: 128, num_workers: 8, 0.0141044 seconds +coalesced reads, workgroups: 128, num_workers: 16, 0.00727712 seconds +coalesced reads, workgroups: 128, num_workers: 32, 0.0040504 seconds +coalesced reads, workgroups: 128, num_workers: 64, 0.00331768 seconds +coalesced reads, workgroups: 128, num_workers: 128, 0.00316304 seconds +coalesced reads, workgroups: 256, num_workers: 4, 0.02838392 seconds +coalesced reads, workgroups: 256, num_workers: 8, 0.0142284 seconds +coalesced reads, workgroups: 256, num_workers: 16, 0.00733648 seconds +coalesced reads, workgroups: 256, num_workers: 32, 0.0040524 seconds +coalesced reads, workgroups: 256, num_workers: 64, 0.0032536 seconds +coalesced reads, workgroups: 256, num_workers: 128, 0.00317048 seconds +coalesced reads, workgroups: 512, num_workers: 4, 0.02871936 seconds +coalesced reads, workgroups: 512, num_workers: 8, 0.01440816 seconds +coalesced reads, workgroups: 512, num_workers: 16, 0.00772872 seconds +coalesced reads, workgroups: 512, num_workers: 32, 0.00432824 seconds +coalesced reads, workgroups: 512, num_workers: 64, 0.00321256 seconds +coalesced reads, workgroups: 512, num_workers: 128, 0.00320032 seconds +blocked reads, workgroups: 8, num_workers: 4, 0.18871568 seconds +blocked reads, workgroups: 8, num_workers: 8, 0.09651504 seconds +blocked reads, workgroups: 8, num_workers: 16, 0.05294464 seconds +blocked reads, workgroups: 8, num_workers: 32, 0.025284 seconds +blocked reads, workgroups: 8, num_workers: 64, 0.01244576 seconds +blocked reads, workgroups: 8, num_workers: 128, 0.00875008 seconds +blocked reads, workgroups: 16, num_workers: 4, 0.10793112 seconds +blocked reads, workgroups: 16, num_workers: 8, 0.06889168 seconds +blocked reads, workgroups: 16, num_workers: 16, 0.04178152 seconds +blocked reads, workgroups: 16, num_workers: 32, 0.0179508 seconds +blocked reads, workgroups: 16, num_workers: 64, 0.012344 seconds +blocked reads, workgroups: 16, num_workers: 128, 0.03174904 seconds +blocked reads, workgroups: 32, num_workers: 4, 0.0625028 seconds +blocked reads, workgroups: 32, num_workers: 8, 0.03751328 seconds +blocked reads, workgroups: 32, num_workers: 16, 0.0203404 seconds +blocked reads, workgroups: 32, num_workers: 32, 0.01248976 seconds +blocked reads, workgroups: 32, num_workers: 64, 0.0322088 seconds +blocked reads, workgroups: 32, num_workers: 128, 0.07598152 seconds +blocked reads, workgroups: 64, num_workers: 4, 0.0344596 seconds +blocked reads, workgroups: 64, num_workers: 8, 0.01866488 seconds +blocked reads, workgroups: 64, num_workers: 16, 0.01246112 seconds +blocked reads, workgroups: 64, num_workers: 32, 0.0332296 seconds +blocked reads, workgroups: 64, num_workers: 64, 0.0799416 seconds +blocked reads, workgroups: 64, num_workers: 128, 0.08079288 seconds +blocked reads, workgroups: 128, num_workers: 4, 0.03448208 seconds +blocked reads, workgroups: 128, num_workers: 8, 0.01981064 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.01336208 seconds +blocked reads, workgroups: 128, num_workers: 32, 0.03436896 seconds +blocked reads, workgroups: 128, num_workers: 64, 0.08290584 seconds +blocked reads, workgroups: 128, num_workers: 128, 0.06716448 seconds +blocked reads, workgroups: 256, num_workers: 4, 0.03447048 seconds +blocked reads, workgroups: 256, num_workers: 8, 0.01991312 seconds +blocked reads, workgroups: 256, num_workers: 16, 0.01241752 seconds +blocked reads, workgroups: 256, num_workers: 32, 0.03467448 seconds +blocked reads, workgroups: 256, num_workers: 64, 
0.06882584 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.04615288 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.03002832 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.02017728 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.01344304 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.03751952 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.04565488 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.0365916 seconds
\ No newline at end of file
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..a502cdeb
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,27 @@
+P3.txt
+
+The platforms detected are:
+---------------------------
+Apple Apple version: OpenCL 1.2 (May 10 2015 19:38:45)
+The devices detected on platform Apple are:
+---------------------------
+Intel(R) Core(TM) i5-4260U CPU @ 1.40GHz [Type: CPU ]
+Maximum clock Frequency: 1400 MHz
+Maximum allocable memory size: 2147 MB
+Maximum work group size 1024
+Maximum work item dimensions 3
+Maximum work item size [1024, 1, 1]
+---------------------------
+HD Graphics 5000 [Type: GPU ]
+Maximum clock Frequency: 1000 MHz
+Maximum allocable memory size: 402 MB
+Maximum work group size 512
+Maximum work item dimensions 3
+Maximum work item size [512, 512, 512]
+---------------------------
+
+configuration ('coalesced', 128, 128): 0.00316304 seconds
+
+Based on my configuration (MacBook Air, ~2014), the coalesced read-and-sum with 128 work groups and 128 workers came out as the winner. From the coalesced plot that I created for the output data, it is easy to see that as the number of work groups increases, the execution time of the sums decreases. This indicates that having more work groups helps performance when reading from memory in a coalesced manner.
+
+The blocked sum plot tells a different story. There, the optimal number of work groups is 8, and performance improves as the number of workers increases. However, once we increase both the number of work groups and the number of workers, performance actually decreases. This may be because each thread's block of indices, whose length is derived from the global size, increasingly runs past the bounds check, so less useful work is done per iteration and performance drops.
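+
+Illustrative sketch (my addition): the difference between the two kernels is the index pattern each work-item uses. In the coalesced kernel, work-item i reads x[i], x[i + global_size], ..., so adjacent work-items touch adjacent addresses on every pass and the reads can be combined into wide memory transactions; in the blocked kernel each work-item sums its own contiguous slice of length L = ceil(N / global_size), so adjacent work-items read addresses L apart. A plain-Python view of the indices (function names are mine):
+
+    import math
+
+    def coalesced_indices(i, global_size, N):
+        # Work-item i touches x[i], x[i + global_size], x[i + 2*global_size], ...
+        return list(range(i, N, global_size))
+
+    def blocked_indices(i, global_size, N):
+        # Work-item i touches its own contiguous block x[i*L .. (i+1)*L - 1].
+        L = math.ceil(N / global_size)
+        return list(range(i * L, min((i + 1) * L, N)))
+
+For example, with global_size = 4 and N = 16, work-items 0..3 read {0,4,8,12}, {1,5,9,13}, ... in the coalesced case but {0..3}, {4..7}, ... in the blocked case, which is why the coalesced version scales better as the work-group count grows.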
\ No newline at end of file diff --git a/HW3/P3/P3_plot.py b/HW3/P3/P3_plot.py new file mode 100644 index 00000000..7ed6db33 --- /dev/null +++ b/HW3/P3/P3_plot.py @@ -0,0 +1,42 @@ +# P3_plot.py +# Plot Output +import numpy as np +import matplotlib.pyplot as plt + +if __name__ == '__main__': + # Read in data + results = np.loadtxt('HW3_P3_results2.txt', delimiter=',', dtype=str) + + # Split data + execution_time = [x[-1].split()[0] for x in results] + workgroups = [x[1].split()[1] for x in results] + num_workers = [x[2].split()[1] for x in results] + + ### Plot coalesced ### + plt.figure(figsize=(10,8)) + + for x in xrange(6, 43, 6): + plt.plot(num_workers[x-6:x], execution_time[x-6:x], label='Work Groups: %2s'%(workgroups[x-1])) + plt.scatter(num_workers[x-6:x], execution_time[x-6:x]) + + plt.legend() + plt.title("Coalesced Sums Execution Time") + plt.ylabel("Execution Time (seconds)") + plt.xlabel("Number of Workers") + plt.ylim(-.01, .18) + plt.xlim(-2,132) + plt.show() + + ### Plot Block ### + plt.figure(figsize=(10,8)) + for x in xrange(int(84/2)+6, 85, 6): + plt.plot(num_workers[x-6:x], execution_time[x-6:x], label='Work Groups: %2s'%(workgroups[x-1])) + plt.scatter(num_workers[x-6:x], execution_time[x-6:x]) + + plt.legend() + plt.title("Blocked Reads Execution Time") + plt.ylabel("Execution Time (seconds)") + plt.xlabel("Number of Workers") + plt.ylim(-.02,.2) + plt.xlim(-2,132) + plt.show() \ No newline at end of file diff --git a/HW3/P3/blocked.png b/HW3/P3/blocked.png new file mode 100644 index 00000000..a64a6364 Binary files /dev/null and b/HW3/P3/blocked.png differ diff --git a/HW3/P3/coalesced.png b/HW3/P3/coalesced.png new file mode 100644 index 00000000..7039f9a6 Binary files /dev/null and b/HW3/P3/coalesced.png differ diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..436a2541 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -3,13 +3,24 @@ __kernel void sum_coalesced(__global float* x, __local float* fast, long N) { - float sum = 0; - size_t local_id = get_local_id(0); + + // Get global id and size + size_t i = get_global_id(0); + size_t global_size = get_global_size(0); + // Get local id and size + size_t local_id = get_local_id(0); + size_t local_size = get_local_size(0); + + // Initialize sum + float sum = 0; + // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint k = 0; i + k*global_size < N-1; k++) { + + // Sum x[i] + x[i + stride] + x[i + 2*stride] + ... + sum += x[i + k*global_size]; } fast[local_id] = sum; @@ -24,8 +35,13 @@ __kernel void sum_coalesced(__global float* x, // You can assume get_local_size(0) is a power of 2. 
// // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (uint stride=local_size/2; stride>0; stride >>= 1) { + // If the local_id is smaller than the stride then continue + if (local_id < stride) + fast[local_id] += fast[local_id + stride]; + + // Add barrier to ensure that all adds are complete + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; @@ -36,9 +52,17 @@ __kernel void sum_blocked(__global float* x, __local float* fast, long N) { - float sum = 0; + // Initialize get_x variables size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + uint local_size = get_local_size(0); + + // Get global ID + uint i = get_global_id(0); + int global_size = get_global_size(0); + + // Initialize L = ceiling(N/global_size) and sum + int L = ceil((float)N / global_size); + float sum = 0; // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 @@ -48,8 +72,10 @@ __kernel void sum_blocked(__global float* x, // // Be careful that each thread stays in bounds, both relative to // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int k=i*L; k <= (i+1)*L-1; k++) { + // Ensure that k stays in bounds relative to N + if (k < N) + sum += x[k]; } fast[local_id] = sum; @@ -61,11 +87,14 @@ __kernel void sum_blocked(__global float* x, // in fast[i], for offset = (local_size >> j) for j from 1 to // log_2(local_size) // + // You can assume get_local_size(0) is a power of 2. // // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for (int stride=local_size/2; stride>0; stride >>= 1) { + if (local_id < stride) + fast[local_id] += fast[local_id + stride]; + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..53ab27a6 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -12,7 +12,26 @@ median_3x3(__global __read_only float *in_values, // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. 
+ + // Initialize the Local Position + // x = rows, y = cols + const int lx = get_local_id(0); + const int ly = get_local_id(1); + + // Now the Global Position + const int gx = get_global_id(0); + const int gy = get_global_id(1); + // Local buffer top left pixel reference + const int buf_corner_x = gx - lx - halo; + const int buf_corner_y = gy - ly - halo; + + // Local buffer location of current pixel + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + // 1D index of thread within our work-group + const int idx_1D = ly * get_local_size(0) + lx; // Load into buffer (with 1-pixel halo). // @@ -21,14 +40,50 @@ median_3x3(__global __read_only float *in_values, // // Note that globally out-of-bounds pixels should be replaced // with the nearest valid pixel's value. + + // Stay within image boundary + if (idx_1D < buf_w){ + + // Loop through rows + for (int row=0; row < buf_h; row++){ + + // Save global reference based on buffer location + int x_ref = buf_corner_x + idx_1D; + int y_ref = buf_corner_y + row; + + // Load buffer based on global location + // If y/x < 0, take 0 and if > h/w take h/w + // Kevin Chen helped me visual the min, max function + buffer[row * buf_w + idx_1D] = in_values[ min( max(0, y_ref), (h-halo) ) * w \ + + min( max(0, x_ref), (w-halo) ) ]; + } + } + + // Ensure all buffer loads are complete + barrier(CLK_LOCAL_MEM_FENCE); // Compute 3x3 median for each pixel in core (non-halo) pixels // // We've given you median9.h, and included it above, so you can // use the median9() function. + float filter_result; + // Ensure that global id stays within w, h + if ( (gx