From 5a3affa22d3cd170da0aec5fadadf57e6509d201 Mon Sep 17 00:00:00 2001 From: Alex Zatsman Date: Mon, 2 Jan 2017 12:20:42 -0500 Subject: [PATCH 1/4] Configuration exposing the bug in Issue 61 Added overloads for 'error' and 'results' with two matrices being compared for equality. Changed matrix initializations from constant values to non-constant ones. Stored the result from the sequantial host-based matrix multiplication in a separate array for later comparison. --- Solutions/Exercise08/Cpp/matmul.cpp | 20 ++++---- Solutions/Exercise08/Cpp/matrix_lib.cpp | 62 ++++++++++++++++++++----- Solutions/Exercise08/Cpp/matrix_lib.hpp | 3 ++ 3 files changed, 65 insertions(+), 20 deletions(-) diff --git a/Solutions/Exercise08/Cpp/matmul.cpp b/Solutions/Exercise08/Cpp/matmul.cpp index c5f48ab..0cf85f8 100644 --- a/Solutions/Exercise08/Cpp/matmul.cpp +++ b/Solutions/Exercise08/Cpp/matmul.cpp @@ -45,9 +45,13 @@ int main(int argc, char *argv[]) std::vector h_A(size); // Host memory for Matrix A std::vector h_B(size); // Host memory for Matrix B std::vector h_C(size); // Host memory for Matrix C + std::vector C0 (size); // Result computed sequantially on the host for + // later error checking. cl::Buffer d_a, d_b, d_c; // Matrices in device memory + + //-------------------------------------------------------------------------------- // Create a context and queue //-------------------------------------------------------------------------------- @@ -84,7 +88,7 @@ int main(int argc, char *argv[]) // Run sequential matmul //-------------------------------------------------------------------------------- - initmat(N, h_A, h_B, h_C); + initmat(N, h_A, h_B, C0); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) @@ -92,10 +96,10 @@ int main(int argc, char *argv[]) zero_mat(N, h_C); start_time = static_cast(timer.getTimeMilliseconds()) / 1000.0; - seq_mat_mul_sdot(N, h_A, h_B, h_C); + seq_mat_mul_sdot(N, h_A, h_B, C0); run_time = static_cast(timer.getTimeMilliseconds()) / 1000.0 - start_time; - results(N, h_C, run_time); + results(N, C0, C0, run_time); } //-------------------------------------------------------------------------------- @@ -144,7 +148,7 @@ int main(int argc, char *argv[]) cl::copy(queue, d_c, h_C.begin(), h_C.end()); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -177,7 +181,7 @@ int main(int argc, char *argv[]) cl::copy(queue, d_c, h_C.begin(), h_C.end()); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -211,7 +215,7 @@ int main(int argc, char *argv[]) cl::copy(queue, d_c, h_C.begin(), h_C.end()); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -248,7 +252,7 @@ int main(int argc, char *argv[]) cl::copy(queue, d_c, h_C.begin(), h_C.end()); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -297,7 +301,7 @@ int main(int argc, char *argv[]) cl::copy(queue, d_c, h_C.begin(), h_C.end()); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop } catch (cl::Error err) diff --git a/Solutions/Exercise08/Cpp/matrix_lib.cpp b/Solutions/Exercise08/Cpp/matrix_lib.cpp index d632420..f621b25 100644 --- a/Solutions/Exercise08/Cpp/matrix_lib.cpp +++ b/Solutions/Exercise08/Cpp/matrix_lib.cpp @@ -43,26 +43,34 @@ void seq_mat_mul_sdot(int N, std::vector& A, std::vector& B, std:: //------------------------------------------------------------------------------ // -// Function to initialize the input matrices A and B +// Function to initialize the input matrices A and B. +// Matrices are initialized to small but non-constant values. The values need +// to be relatively small to avoid signle-precision floating point errors +// in long sums. // //------------------------------------------------------------------------------ + + + void initmat(int N, std::vector& A, std::vector& B, std::vector& C) { - int i, j; + int i, j; - /* Initialize matrices */ + /* Initialize matrices */ - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - A[i*N+j] = AVAL; + int vv = 1.0; - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - B[i*N+j] = BVAL; + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + A[i*N+j] = static_cast((vv++) % 17); - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - C[i*N+j] = 0.0f; + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + B[i*N+j] = static_cast((vv++) % 11); + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + C[i*N+j] = static_cast((vv++) % 19); } //------------------------------------------------------------------------------ @@ -98,6 +106,7 @@ void trans(int N, std::vector& B, std::vector& Btrans) // Function to compute errors of the product matrix // //------------------------------------------------------------------------------ + float error(int N, std::vector& C) { int i,j; @@ -114,6 +123,23 @@ float error(int N, std::vector& C) return errsq; } +// Compare two matrices, which are expected to be identical, and return the error: + +float error (int N, std::vector& C1, std::vector& C2) +{ + int i,j; + float cval, errsq, err; + cval = (float) N * AVAL * BVAL; + errsq = 0.0f; + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + err = C1[i*N+j] - C2[i*N+j]; + errsq += err * err; + } + } + return errsq; +} + //------------------------------------------------------------------------------ // // Function to analyze and output results @@ -132,3 +158,15 @@ void results(int N, std::vector& C, double run_time) printf("\n Errors in multiplication: %f\n",errsq); } +// Compare two matrices: + +void results (int N, std::vector& C1, std::vector& C2, double run_time) +{ + float mflops; + float errsq; + mflops = 2.0 * N * N * N/(1000000.0f * run_time); + printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); + errsq = error(N, C1, C2); + if (std::isnan(errsq) || errsq > TOL) + printf("\n Errors in multiplication: %f\n",errsq); +} diff --git a/Solutions/Exercise08/Cpp/matrix_lib.hpp b/Solutions/Exercise08/Cpp/matrix_lib.hpp index d6d2fd2..209a33f 100644 --- a/Solutions/Exercise08/Cpp/matrix_lib.hpp +++ b/Solutions/Exercise08/Cpp/matrix_lib.hpp @@ -55,6 +55,9 @@ float error(int N, std::vector& C); // Function to analyze and output results // //------------------------------------------------------------------------------ + void results(int N, std::vector& C, double run_time); +void results(int N, std::vector& C1, std::vector& C2, double run_time); + #endif From c380e0843f4e4c27ee597b0679375fbaa144f546 Mon Sep 17 00:00:00 2001 From: Alex Zatsman Date: Mon, 2 Jan 2017 12:22:52 -0500 Subject: [PATCH 2/4] Fixed the bug in C_block_from.cl (Issue 61) It seems that the previous version was computing a transposed result. This change made the execution on my machine 4 times slower -- no clue why. But it passes the test. --- Solutions/Exercise08/C_block_form.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Solutions/Exercise08/C_block_form.cl b/Solutions/Exercise08/C_block_form.cl index 8d3956a..b92bbca 100755 --- a/Solutions/Exercise08/C_block_form.cl +++ b/Solutions/Exercise08/C_block_form.cl @@ -99,7 +99,7 @@ __kernel void mmul( // the contribution to C(i,j) from this block #pragma unroll for (kloc=0; kloc Date: Mon, 7 Aug 2017 23:46:59 -0400 Subject: [PATCH 3/4] Restored from 6f3e21d9 after the merge. --- Solutions/Exercise08/C_block_form.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Solutions/Exercise08/C_block_form.cl b/Solutions/Exercise08/C_block_form.cl index 5c840e4..fa65664 100755 --- a/Solutions/Exercise08/C_block_form.cl +++ b/Solutions/Exercise08/C_block_form.cl @@ -99,7 +99,7 @@ __kernel void mmul( // the contribution to C(i,j) from this block #pragma unroll for (kloc=0; kloc Date: Tue, 8 Aug 2017 10:52:07 -0400 Subject: [PATCH 4/4] Modified initialization and comparison functions in C and Python for more rigorous testing of the results. --- .gitignore | 1 + Solutions/Exercise08/C/matmul.c | 19 +++++---- Solutions/Exercise08/C/matrix_lib.c | 51 +++++++++++++------------ Solutions/Exercise08/C/matrix_lib.h | 4 +- Solutions/Exercise08/Cpp/matrix_lib.cpp | 2 +- Solutions/Exercise08/Python/helper.py | 34 +++++++++++------ Solutions/Exercise08/Python/matmul.py | 29 +++++++------- 7 files changed, 79 insertions(+), 61 deletions(-) diff --git a/.gitignore b/.gitignore index cea69de..b030cf6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # C *.o +*~ # Produced binarys Exercises/Exercise01/C/DeviceInfo diff --git a/Solutions/Exercise08/C/matmul.c b/Solutions/Exercise08/C/matmul.c index 72439cb..cc4de21 100644 --- a/Solutions/Exercise08/C/matmul.c +++ b/Solutions/Exercise08/C/matmul.c @@ -34,6 +34,8 @@ int main(int argc, char *argv[]) float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix + float *C0; // Result computed sequantially on the host for + // later error checking. int N; // A[N][N], B[N][N], C[N][N] int size; // number of elements in each matrix @@ -58,7 +60,7 @@ int main(int argc, char *argv[]) h_A = (float *)malloc(size * sizeof(float)); h_B = (float *)malloc(size * sizeof(float)); h_C = (float *)malloc(size * sizeof(float)); - + C0 = (float *)malloc(size * sizeof(float)); //-------------------------------------------------------------------------------- @@ -105,10 +107,10 @@ int main(int argc, char *argv[]) zero_mat(N, h_C); start_time = wtime(); - seq_mat_mul_sdot(N, h_A, h_B, h_C); + seq_mat_mul_sdot(N, h_A, h_B, C0); run_time = wtime() - start_time; - results(N, h_C, run_time); + results(N, C0, C0, run_time); } //-------------------------------------------------------------------------------- @@ -195,7 +197,7 @@ int main(int argc, char *argv[]) 0, NULL, NULL); checkError(err, "Reading back d_c"); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -262,7 +264,7 @@ int main(int argc, char *argv[]) 0, NULL, NULL); checkError(err, "Reading back d_c"); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -331,7 +333,7 @@ int main(int argc, char *argv[]) 0, NULL, NULL); checkError(err, "Reading back d_c"); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -401,7 +403,7 @@ int main(int argc, char *argv[]) 0, NULL, NULL); checkError(err, "Reading back d_c"); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -478,7 +480,7 @@ int main(int argc, char *argv[]) 0, NULL, NULL); checkError(err, "Reading back d_c"); - results(N, h_C, run_time); + results(N, h_C, C0, run_time); } // end for loop @@ -489,6 +491,7 @@ int main(int argc, char *argv[]) free(h_A); free(h_B); free(h_C); + free(C0); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); diff --git a/Solutions/Exercise08/C/matrix_lib.c b/Solutions/Exercise08/C/matrix_lib.c index e77587f..92a3949 100644 --- a/Solutions/Exercise08/C/matrix_lib.c +++ b/Solutions/Exercise08/C/matrix_lib.c @@ -47,21 +47,23 @@ void seq_mat_mul_sdot(int N, float *A, float *B, float *C) //------------------------------------------------------------------------------ void initmat(int N, float *A, float *B, float *C) { - int i, j; + int i, j; - /* Initialize matrices */ + /* Initialize matrices */ - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - A[i*N+j] = AVAL; + int vv = 1; - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - B[i*N+j] = BVAL; + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + A[i*N+j] = (float) ((vv++) % 17); - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - C[i*N+j] = 0.0f; + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + B[i*N+j] = (float) ((vv++) % 11); + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + C[i*N+j] = (float) ((vv++) % 19); } //------------------------------------------------------------------------------ @@ -97,20 +99,19 @@ void trans(int N, float *B, float *Btrans) // Function to compute errors of the product matrix // //------------------------------------------------------------------------------ -float error(int N, float *C) +float error(int N, float *C1, float *C2) { - int i,j; - float cval, errsq, err; - cval = (float) N * AVAL * BVAL; - errsq = 0.0f; - - for (i = 0; i < N; i++) { - for (j = 0; j < N; j++) { - err = C[i*N+j] - cval; - errsq += err * err; - } + int i,j; + float cval, errsq, err; + cval = (float) N * AVAL * BVAL; + errsq = 0.0f; + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + err = C1[i*N+j] - C2[i*N+j]; + errsq += err * err; } - return errsq; + } + return errsq; } //------------------------------------------------------------------------------ @@ -118,14 +119,14 @@ float error(int N, float *C) // Function to analyze and output results // //------------------------------------------------------------------------------ -void results(int N, float *C, double run_time) +void results(int N, float *C1, float *C2, double run_time) { float mflops; float errsq; mflops = 2.0 * N * N * N/(1000000.0f * run_time); printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); - errsq = error(N, C); + errsq = error(N, C1, C2); if (isnan(errsq) || errsq > TOL) { printf("\n Errors in multiplication: %f\n",errsq); exit(1); diff --git a/Solutions/Exercise08/C/matrix_lib.h b/Solutions/Exercise08/C/matrix_lib.h index 197321a..99057b1 100644 --- a/Solutions/Exercise08/C/matrix_lib.h +++ b/Solutions/Exercise08/C/matrix_lib.h @@ -46,7 +46,7 @@ void trans(int N, float *B, float *Btrans); // Function to compute errors of the product matrix // //------------------------------------------------------------------------------ -float error(int N, float *C); +float error(int N, float *C1, float *C2); //------------------------------------------------------------------------------ @@ -54,6 +54,6 @@ float error(int N, float *C); // Function to analyze and output results // //------------------------------------------------------------------------------ -void results(int N, float *C, double run_time); +void results(int N, float *C1, float *C2, double run_time); #endif diff --git a/Solutions/Exercise08/Cpp/matrix_lib.cpp b/Solutions/Exercise08/Cpp/matrix_lib.cpp index f621b25..233c42c 100644 --- a/Solutions/Exercise08/Cpp/matrix_lib.cpp +++ b/Solutions/Exercise08/Cpp/matrix_lib.cpp @@ -58,7 +58,7 @@ void initmat(int N, std::vector& A, std::vector& B, std::vector TOL): print "Errors in multiplication:", errsq + +## Return 1D array of length N*N initialized to non-constant small values. +## Second variable 'm' is used to allow different initializations for different m's. +## Current implementation simply initializes matrices to sequential values modulo 'm', +## but one can come up with a more randomized scheme, like using 'm' for seeding +## random number generator. + +def initMat (N, m): + return numpy.arange(N*N, dtype=numpy.float32) % m; diff --git a/Solutions/Exercise08/Python/matmul.py b/Solutions/Exercise08/Python/matmul.py index 3a9e745..75c0edb 100644 --- a/Solutions/Exercise08/Python/matmul.py +++ b/Solutions/Exercise08/Python/matmul.py @@ -30,17 +30,20 @@ size = N * N + # A matrix -h_A = numpy.empty(size).astype(numpy.float32) -h_A.fill(AVAL) +h_A = initMat (N, 7) # B matrix -h_B = numpy.empty(size).astype(numpy.float32) -h_B.fill(BVAL) +h_B = initMat (N, 11) # C matrix h_C = numpy.empty(size).astype(numpy.float32) +## C0 is the product of h_A and h_B for later comparisons: + +C0 = numpy.matmul(h_A.reshape([N,N]), h_B.reshape([N,N])).reshape(N*N); + print "\n===== Sequential, matrix mult (dot prod), order", ORDER, "on host CPU ======\n" for i in range(COUNT): @@ -59,11 +62,10 @@ queue = cl.CommandQueue(context) # Reset host buffers - just to play it safe -h_A = numpy.empty(size).astype(numpy.float32) -h_A.fill(AVAL) -h_B = numpy.empty(size).astype(numpy.float32) -h_B.fill(BVAL) +h_A = initMat (N, 7) +h_B = initMat (N, 11); h_C = numpy.empty(size).astype(numpy.float32) +C0 = numpy.matmul(h_A.reshape([N,N]), h_B.reshape([N,N])).reshape(N*N); #-------------------------------------------------------------------------------- @@ -92,7 +94,7 @@ run_time = time() - start_time cl.enqueue_copy(queue, h_C, d_c) - results(N, h_C, run_time) + results(N, h_C, C0, run_time) #-------------------------------------------------------------------------------- # OpenCL matrix multiplication ... C row per work item @@ -114,7 +116,7 @@ run_time = time() - start_time cl.enqueue_copy(queue, h_C, d_c) - results(N, h_C, run_time) + results(N, h_C, C0, run_time) #-------------------------------------------------------------------------------- # OpenCL matrix multiplication ... C row per work item, A row in pivate memory @@ -136,7 +138,7 @@ run_time = time() - start_time cl.enqueue_copy(queue, h_C, d_c) - results(N, h_C, run_time) + results(N, h_C, C0, run_time) #-------------------------------------------------------------------------------- # OpenCL matrix multiplication ... C row per work item, A row pivate, B col local @@ -160,7 +162,7 @@ run_time = time() - start_time cl.enqueue_copy(queue, h_C, d_c) - results(N, h_C, run_time) + results(N, h_C, C0, run_time) #-------------------------------------------------------------------------------- # OpenCL matrix multiplication ... blocked @@ -190,5 +192,4 @@ run_time = time() - start_time cl.enqueue_copy(queue, h_C, d_c) - results(N, h_C, run_time) - + results(N, h_C, C0, run_time)