From 5a3affa22d3cd170da0aec5fadadf57e6509d201 Mon Sep 17 00:00:00 2001
From: Alex Zatsman <ztsmn@yahoo.com>
Date: Mon, 2 Jan 2017 12:20:42 -0500
Subject: [PATCH 1/4] Configuration exposing the bug in Issue 61

Added overloads for 'error' and 'results' with two matrices being
compared for equality.

Changed matrix initializations from constant values to non-constant
ones.

Stored the result from the sequantial host-based matrix multiplication
in a separate array for later comparison.
---
 Solutions/Exercise08/Cpp/matmul.cpp     | 20 ++++----
 Solutions/Exercise08/Cpp/matrix_lib.cpp | 62 ++++++++++++++++++++-----
 Solutions/Exercise08/Cpp/matrix_lib.hpp |  3 ++
 3 files changed, 65 insertions(+), 20 deletions(-)
diff --git a/Solutions/Exercise08/Cpp/matmul.cpp b/Solutions/Exercise08/Cpp/matmul.cpp
index c5f48ab..0cf85f8 100644
--- a/Solutions/Exercise08/Cpp/matmul.cpp
+++ b/Solutions/Exercise08/Cpp/matmul.cpp
@@ -45,9 +45,13 @@ int main(int argc, char *argv[])
     std::vector<float> h_A(size); // Host memory for Matrix A
     std::vector<float> h_B(size); // Host memory for Matrix B
     std::vector<float> h_C(size); // Host memory for Matrix C
+    std::vector<float> C0 (size); // Result computed sequantially on the host for
+                                  //   later error checking.
 
     cl::Buffer d_a, d_b, d_c;   // Matrices in device memory
 
+
+
 //--------------------------------------------------------------------------------
 // Create a context and queue
 //--------------------------------------------------------------------------------
@@ -84,7 +88,7 @@ int main(int argc, char *argv[])
 // Run sequential matmul
 //--------------------------------------------------------------------------------
 
-        initmat(N, h_A, h_B, h_C);
+        initmat(N, h_A, h_B, C0);
 
         printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
         for(int i = 0; i < COUNT; i++)
@@ -92,10 +96,10 @@ int main(int argc, char *argv[])
             zero_mat(N, h_C);
             start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0;
 
-            seq_mat_mul_sdot(N, h_A, h_B, h_C);
+            seq_mat_mul_sdot(N, h_A, h_B, C0);
 
             run_time  = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time;
-            results(N, h_C, run_time);
+            results(N, C0, C0, run_time);
         }
 
 //--------------------------------------------------------------------------------
@@ -144,7 +148,7 @@ int main(int argc, char *argv[])
 
             cl::copy(queue, d_c, h_C.begin(), h_C.end());
 
-            results(N, h_C, run_time);
+            results(N, h_C, C0, run_time);
 
         } // end for loop
 
@@ -177,7 +181,7 @@ int main(int argc, char *argv[])
 
             cl::copy(queue, d_c, h_C.begin(), h_C.end());
 
-            results(N, h_C, run_time);
+            results(N, h_C, C0, run_time);
 
         } // end for loop
 
@@ -211,7 +215,7 @@ int main(int argc, char *argv[])
 
             cl::copy(queue, d_c, h_C.begin(), h_C.end());
 
-            results(N, h_C, run_time);
+            results(N, h_C, C0, run_time);
 
         } // end for loop
 
@@ -248,7 +252,7 @@ int main(int argc, char *argv[])
 
             cl::copy(queue, d_c, h_C.begin(), h_C.end());
 
-            results(N, h_C, run_time);
+            results(N, h_C, C0, run_time);
 
         } // end for loop
 
@@ -297,7 +301,7 @@ int main(int argc, char *argv[])
 
             cl::copy(queue, d_c, h_C.begin(), h_C.end());
 
-            results(N, h_C, run_time);
+            results(N, h_C, C0, run_time);
 
         } // end for loop
     } catch (cl::Error err)
diff --git a/Solutions/Exercise08/Cpp/matrix_lib.cpp b/Solutions/Exercise08/Cpp/matrix_lib.cpp
index d632420..f621b25 100644
--- a/Solutions/Exercise08/Cpp/matrix_lib.cpp
+++ b/Solutions/Exercise08/Cpp/matrix_lib.cpp
@@ -43,26 +43,34 @@ void seq_mat_mul_sdot(int N, std::vector<float>& A, std::vector<float>& B, std::
 
 //------------------------------------------------------------------------------
 //
-//  Function to initialize the input matrices A and B
+//  Function to initialize the input matrices A and B.
+//  Matrices are initialized to small but non-constant values. The values need
+//    to be relatively small to avoid signle-precision floating point errors
+//    in long sums.
 //
 //------------------------------------------------------------------------------
+
+
+
 void initmat(int N, std::vector<float>& A, std::vector<float>& B, std::vector<float>& C)
 {
-    int i, j;
+  int i, j;
 
-    /* Initialize matrices */
+  /* Initialize matrices */
 
-    for (i = 0; i < N; i++)
-        for (j = 0; j < N; j++)
-            A[i*N+j] = AVAL;
+  int vv = 1.0;
 
-    for (i = 0; i < N; i++)
-        for (j = 0; j < N; j++)
-            B[i*N+j] = BVAL;
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      A[i*N+j] = static_cast<float>((vv++) % 17);
 
-    for (i = 0; i < N; i++)
-        for (j = 0; j < N; j++)
-            C[i*N+j] = 0.0f;
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      B[i*N+j] = static_cast<float>((vv++) % 11);
+
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      C[i*N+j] = static_cast<float>((vv++) % 19);
 }
 
 //------------------------------------------------------------------------------
@@ -98,6 +106,7 @@ void trans(int N, std::vector<float>& B, std::vector<float>& Btrans)
 //  Function to compute errors of the product matrix
 //
 //------------------------------------------------------------------------------
+
 float error(int N, std::vector<float>& C)
 {
    int i,j;
@@ -114,6 +123,23 @@ float error(int N, std::vector<float>& C)
     return errsq;
 }
 
+// Compare two matrices, which are expected to be identical, and return the error:
+
+float error (int N, std::vector<float>& C1, std::vector<float>& C2)
+{
+  int i,j;
+  float cval, errsq, err;
+  cval  = (float) N * AVAL * BVAL;
+  errsq = 0.0f;
+  for   (i = 0; i < N; i++) {
+    for (j = 0; j < N; j++) {
+      err = C1[i*N+j] - C2[i*N+j];
+      errsq += err * err;
+    }
+  }
+  return errsq;
+}
+
 //------------------------------------------------------------------------------
 //
 //  Function to analyze and output results
@@ -132,3 +158,15 @@ void results(int N, std::vector<float>& C, double run_time)
            printf("\n Errors in multiplication: %f\n",errsq);
 }
 
+// Compare two matrices:
+
+void results (int N, std::vector<float>& C1, std::vector<float>& C2, double run_time)
+{
+  float mflops;
+  float errsq;
+  mflops = 2.0 * N * N * N/(1000000.0f * run_time);
+  printf(" %.2f seconds at %.1f MFLOPS \n",  run_time,mflops);
+  errsq = error(N, C1, C2);
+  if (std::isnan(errsq) || errsq > TOL)
+    printf("\n Errors in multiplication: %f\n",errsq);
+}
diff --git a/Solutions/Exercise08/Cpp/matrix_lib.hpp b/Solutions/Exercise08/Cpp/matrix_lib.hpp
index d6d2fd2..209a33f 100644
--- a/Solutions/Exercise08/Cpp/matrix_lib.hpp
+++ b/Solutions/Exercise08/Cpp/matrix_lib.hpp
@@ -55,6 +55,9 @@ float error(int N, std::vector<float>& C);
 //  Function to analyze and output results 
 //
 //------------------------------------------------------------------------------
+
 void results(int N, std::vector<float>& C, double run_time);
     
+void results(int N, std::vector<float>& C1, std::vector<float>& C2, double run_time);
+    
 #endif

From c380e0843f4e4c27ee597b0679375fbaa144f546 Mon Sep 17 00:00:00 2001
From: Alex Zatsman <ztsmn@yahoo.com>
Date: Mon, 2 Jan 2017 12:22:52 -0500
Subject: [PATCH 2/4] Fixed the bug in C_block_from.cl (Issue 61)

It seems that the previous version was computing a transposed result.

This change made the execution on my machine 4 times slower -- no clue
why. But it passes the test.
---
 Solutions/Exercise08/C_block_form.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Solutions/Exercise08/C_block_form.cl b/Solutions/Exercise08/C_block_form.cl
index 8d3956a..b92bbca 100755
--- a/Solutions/Exercise08/C_block_form.cl
+++ b/Solutions/Exercise08/C_block_form.cl
@@ -99,7 +99,7 @@ __kernel void mmul(
        // the contribution to C(i,j) from this block
        #pragma unroll
        for (kloc=0; kloc<blksz; kloc++)
-          Ctmp += Awrk[jloc*blksz+kloc] * Bwrk[kloc*blksz+iloc];
+          Ctmp += Awrk[iloc*blksz+kloc] * Bwrk[kloc*blksz+jloc];
 
        barrier(CLK_LOCAL_MEM_FENCE);
        Abase += Ainc;
@@ -107,6 +107,6 @@ __kernel void mmul(
     }
  
     // update global C matrix 
-    C[j*N+i] = Ctmp;
+    C[i*N+j] = Ctmp;
 
 }

From 67445b7450381e286a9d3e956ac013a31423a66f Mon Sep 17 00:00:00 2001
From: Alex Zatsman <ztsmn@yahoo.com>
Date: Mon, 7 Aug 2017 23:46:59 -0400
Subject: [PATCH 3/4] Restored from 6f3e21d9 after the merge.

---
 Solutions/Exercise08/C_block_form.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Solutions/Exercise08/C_block_form.cl b/Solutions/Exercise08/C_block_form.cl
index 5c840e4..fa65664 100755
--- a/Solutions/Exercise08/C_block_form.cl
+++ b/Solutions/Exercise08/C_block_form.cl
@@ -99,7 +99,7 @@ __kernel void mmul(
        // the contribution to C(i,j) from this block
        #pragma unroll
        for (kloc=0; kloc<blksz; kloc++)
-          Ctmp += Awrk[iloc*blksz+kloc] * Bwrk[kloc*blksz+jloc];
+          Ctmp += Awrk[jloc*blksz+kloc] * Bwrk[kloc*blksz+iloc];
 
        barrier(CLK_LOCAL_MEM_FENCE);
        Abase += Ainc;
@@ -107,6 +107,6 @@ __kernel void mmul(
     }
  
     // update global C matrix 
-    C[i*N+j] = Ctmp;
+    C[j*N+i] = Ctmp;
 
 }

From f99afd541bc3e7a3f244be75de568e4b710cb075 Mon Sep 17 00:00:00 2001
From: Alex Zatsman <ztsmn@yahoo.com>
Date: Tue, 8 Aug 2017 10:52:07 -0400
Subject: [PATCH 4/4] Modified initialization and comparison functions in C and
 Python for more rigorous testing of the results.

---
 .gitignore                              |  1 +
 Solutions/Exercise08/C/matmul.c         | 19 +++++----
 Solutions/Exercise08/C/matrix_lib.c     | 51 +++++++++++++------------
 Solutions/Exercise08/C/matrix_lib.h     |  4 +-
 Solutions/Exercise08/Cpp/matrix_lib.cpp |  2 +-
 Solutions/Exercise08/Python/helper.py   | 34 +++++++++++------
 Solutions/Exercise08/Python/matmul.py   | 29 +++++++-------
 7 files changed, 79 insertions(+), 61 deletions(-)

diff --git a/.gitignore b/.gitignore
index cea69de..b030cf6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 
 # C
 *.o
+*~
 
 # Produced binarys
 Exercises/Exercise01/C/DeviceInfo
diff --git a/Solutions/Exercise08/C/matmul.c b/Solutions/Exercise08/C/matmul.c
index 72439cb..cc4de21 100644
--- a/Solutions/Exercise08/C/matmul.c
+++ b/Solutions/Exercise08/C/matmul.c
@@ -34,6 +34,8 @@ int main(int argc, char *argv[])
     float *h_A;             // A matrix
     float *h_B;             // B matrix
     float *h_C;             // C = A*B matrix
+    float *C0;              // Result computed sequantially on the host for
+                            //   later error checking.
     int N;                  // A[N][N], B[N][N], C[N][N]
     int size;               // number of elements in each matrix
 
@@ -58,7 +60,7 @@ int main(int argc, char *argv[])
     h_A = (float *)malloc(size * sizeof(float));
     h_B = (float *)malloc(size * sizeof(float));
     h_C = (float *)malloc(size * sizeof(float));
-
+    C0  = (float *)malloc(size * sizeof(float));
 
 
 //--------------------------------------------------------------------------------
@@ -105,10 +107,10 @@ int main(int argc, char *argv[])
         zero_mat(N, h_C);
         start_time = wtime();
 
-        seq_mat_mul_sdot(N, h_A, h_B, h_C);
+        seq_mat_mul_sdot(N, h_A, h_B, C0);
 
         run_time  = wtime() - start_time;
-        results(N, h_C, run_time);
+        results(N, C0, C0, run_time);
     }
 
 //--------------------------------------------------------------------------------
@@ -195,7 +197,7 @@ int main(int argc, char *argv[])
             0, NULL, NULL);
         checkError(err, "Reading back d_c");
 
-        results(N, h_C, run_time);
+        results(N, h_C, C0, run_time);
 
     } // end for loop
 
@@ -262,7 +264,7 @@ int main(int argc, char *argv[])
             0, NULL, NULL);
         checkError(err, "Reading back d_c");
 
-        results(N, h_C, run_time);
+        results(N, h_C, C0, run_time);
 
     } // end for loop
 
@@ -331,7 +333,7 @@ int main(int argc, char *argv[])
             0, NULL, NULL);
         checkError(err, "Reading back d_c");
 
-        results(N, h_C, run_time);
+        results(N, h_C, C0, run_time);
 
     } // end for loop
 
@@ -401,7 +403,7 @@ int main(int argc, char *argv[])
             0, NULL, NULL);
         checkError(err, "Reading back d_c");
 
-        results(N, h_C, run_time);
+        results(N, h_C, C0, run_time);
 
     } // end for loop
 
@@ -478,7 +480,7 @@ int main(int argc, char *argv[])
             0, NULL, NULL);
         checkError(err, "Reading back d_c");
 
-        results(N, h_C, run_time);
+        results(N, h_C, C0, run_time);
 
     } // end for loop
 
@@ -489,6 +491,7 @@ int main(int argc, char *argv[])
     free(h_A);
     free(h_B);
     free(h_C);
+    free(C0);
     clReleaseMemObject(d_a);
     clReleaseMemObject(d_b);
     clReleaseMemObject(d_c);
diff --git a/Solutions/Exercise08/C/matrix_lib.c b/Solutions/Exercise08/C/matrix_lib.c
index e77587f..92a3949 100644
--- a/Solutions/Exercise08/C/matrix_lib.c
+++ b/Solutions/Exercise08/C/matrix_lib.c
@@ -47,21 +47,23 @@ void seq_mat_mul_sdot(int N, float *A, float *B, float *C)
 //------------------------------------------------------------------------------
 void initmat(int N, float *A, float *B, float *C)
 {
-    int i, j;
+  int i, j;
 
-    /* Initialize matrices */
+  /* Initialize matrices */
 
-	for (i = 0; i < N; i++)
-		for (j = 0; j < N; j++)
-			A[i*N+j] = AVAL;
+  int vv = 1;
 
-	for (i = 0; i < N; i++)
-		for (j = 0; j < N; j++)
-			B[i*N+j] = BVAL;
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      A[i*N+j] = (float) ((vv++) % 17);
 
-	for (i = 0; i < N; i++)
-		for (j = 0; j < N; j++)
-			C[i*N+j] = 0.0f;
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      B[i*N+j] = (float) ((vv++) % 11);
+
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      C[i*N+j] = (float) ((vv++) % 19);
 }
 
 //------------------------------------------------------------------------------
@@ -97,20 +99,19 @@ void trans(int N, float *B, float *Btrans)
 //  Function to compute errors of the product matrix
 //
 //------------------------------------------------------------------------------
-float error(int N, float *C)
+float error(int N, float *C1, float *C2)
 {
-   int i,j;
-   float cval, errsq, err;
-   cval = (float) N * AVAL * BVAL;
-   errsq = 0.0f;
-
-    for (i = 0; i < N; i++) {
-        for (j = 0; j < N; j++) {
-            err = C[i*N+j] - cval;
-            errsq += err * err;
-        }
+  int i,j;
+  float cval, errsq, err;
+  cval  = (float) N * AVAL * BVAL;
+  errsq = 0.0f;
+  for   (i = 0; i < N; i++) {
+    for (j = 0; j < N; j++) {
+      err = C1[i*N+j] - C2[i*N+j];
+      errsq += err * err;
     }
-    return errsq;
+  }
+  return errsq;
 }
 
 //------------------------------------------------------------------------------
@@ -118,14 +119,14 @@ float error(int N, float *C)
 //  Function to analyze and output results
 //
 //------------------------------------------------------------------------------
-void results(int N, float *C, double run_time)
+void results(int N, float *C1, float *C2, double run_time)
 {
     float mflops;
     float errsq;
 
     mflops = 2.0 * N * N * N/(1000000.0f * run_time);
     printf(" %.2f seconds at %.1f MFLOPS \n",  run_time,mflops);
-    errsq = error(N, C);
+    errsq = error(N, C1, C2);
     if (isnan(errsq) || errsq > TOL) {
         printf("\n Errors in multiplication: %f\n",errsq);
         exit(1);
diff --git a/Solutions/Exercise08/C/matrix_lib.h b/Solutions/Exercise08/C/matrix_lib.h
index 197321a..99057b1 100644
--- a/Solutions/Exercise08/C/matrix_lib.h
+++ b/Solutions/Exercise08/C/matrix_lib.h
@@ -46,7 +46,7 @@ void trans(int N, float *B, float *Btrans);
 //  Function to compute errors of the product matrix
 //
 //------------------------------------------------------------------------------
-float error(int N, float *C);
+float error(int N, float *C1, float *C2);
 
 
 //------------------------------------------------------------------------------
@@ -54,6 +54,6 @@ float error(int N, float *C);
 //  Function to analyze and output results 
 //
 //------------------------------------------------------------------------------
-void results(int N, float *C, double run_time);
+void results(int N, float *C1, float *C2, double run_time);
     
 #endif
diff --git a/Solutions/Exercise08/Cpp/matrix_lib.cpp b/Solutions/Exercise08/Cpp/matrix_lib.cpp
index f621b25..233c42c 100644
--- a/Solutions/Exercise08/Cpp/matrix_lib.cpp
+++ b/Solutions/Exercise08/Cpp/matrix_lib.cpp
@@ -58,7 +58,7 @@ void initmat(int N, std::vector<float>& A, std::vector<float>& B, std::vector<fl
 
   /* Initialize matrices */
 
-  int vv = 1.0;
+  int vv = 1;
 
   for (i = 0; i < N; i++)
     for (j = 0; j < N; j++)
diff --git a/Solutions/Exercise08/Python/helper.py b/Solutions/Exercise08/Python/helper.py
index a5c2f2d..5ca7292 100644
--- a/Solutions/Exercise08/Python/helper.py
+++ b/Solutions/Exercise08/Python/helper.py
@@ -1,8 +1,9 @@
 
 from definitions import *
+import numpy
 
 #  Function to compute the matrix product (sequential algorithm, dot prod)
-def seq_mat_mul_sdot(N, A, B, C):
+def seq_mat_mul_sdot_old (N, A, B, C):
     for i in range(N):
         for j in range(N):
             tmp = 0.0
@@ -10,21 +11,32 @@ def seq_mat_mul_sdot(N, A, B, C):
                 tmp += A[i*N+k] * B[k*N+j]
             C[i*N+j] = tmp
 
+def seq_mat_mul_sdot (N, A, B, C):
+    a2 = A.reshape([N,N]);
+    b2 = B.reshape([N,N]);
+    c2 = numpy.matmul(a2,b2);
+    C  = c2.reshape(N*N);
+
 #  Function to compute errors of the product matrix
-def error(N, C):
-   cval = float(N) * AVAL * BVAL
-   errsq = 0.0
-   for i in range(N):
-       for j in range(N):
-            err = C[i*N+j] - cval
-            errsq += err * err
-   return errsq;
+def error(N, C1, C2):
+    d1 = C1 - C2;
+    d2 = (d1 * d1).sum()
+    return d2;
 
 
 # Function to analyze and output results
-def results(N, C, run_time):
+def results(N, C1, C2, run_time):
     mflops = 2.0 * N * N * N/(1000000.0* run_time)
     print run_time, "seconds at", mflops, "MFLOPS"
-    errsq = error(N, C)
+    errsq = error(N, C1, C2)
     if (errsq > TOL):
         print "Errors in multiplication:", errsq
+
+## Return 1D array of length N*N initialized to non-constant small values.
+## Second variable 'm' is used to allow different initializations for different m's.
+## Current implementation simply initializes matrices to sequential values modulo 'm',
+##   but one can come up with a more randomized scheme, like using 'm' for seeding
+##   random number generator.
+
+def initMat (N, m):
+    return numpy.arange(N*N, dtype=numpy.float32) % m;
diff --git a/Solutions/Exercise08/Python/matmul.py b/Solutions/Exercise08/Python/matmul.py
index 3a9e745..75c0edb 100644
--- a/Solutions/Exercise08/Python/matmul.py
+++ b/Solutions/Exercise08/Python/matmul.py
@@ -30,17 +30,20 @@
 size = N * N
 
 
+
 # A matrix
-h_A = numpy.empty(size).astype(numpy.float32)
-h_A.fill(AVAL)
+h_A = initMat (N, 7)
 
 # B matrix
-h_B = numpy.empty(size).astype(numpy.float32)
-h_B.fill(BVAL)
+h_B = initMat (N, 11)
 
 # C matrix
 h_C = numpy.empty(size).astype(numpy.float32)
 
+## C0 is the product of h_A and h_B for later comparisons:
+
+C0 = numpy.matmul(h_A.reshape([N,N]), h_B.reshape([N,N])).reshape(N*N);
+
 print "\n===== Sequential, matrix mult (dot prod), order", ORDER, "on host CPU ======\n"
 
 for i in range(COUNT):
@@ -59,11 +62,10 @@
 queue = cl.CommandQueue(context)
 
 # Reset host buffers - just to play it safe
-h_A = numpy.empty(size).astype(numpy.float32)
-h_A.fill(AVAL)
-h_B = numpy.empty(size).astype(numpy.float32)
-h_B.fill(BVAL)
+h_A = initMat (N,  7)
+h_B = initMat (N, 11);
 h_C = numpy.empty(size).astype(numpy.float32)
+C0  = numpy.matmul(h_A.reshape([N,N]), h_B.reshape([N,N])).reshape(N*N);
 
 
 #--------------------------------------------------------------------------------
@@ -92,7 +94,7 @@
     run_time = time() - start_time
 
     cl.enqueue_copy(queue, h_C, d_c)
-    results(N, h_C, run_time)
+    results(N, h_C, C0, run_time)
 
 #--------------------------------------------------------------------------------
 # OpenCL matrix multiplication ... C row per work item
@@ -114,7 +116,7 @@
     run_time = time() - start_time
 
     cl.enqueue_copy(queue, h_C, d_c)
-    results(N, h_C, run_time)
+    results(N, h_C, C0, run_time)
 
 #--------------------------------------------------------------------------------
 # OpenCL matrix multiplication ... C row per work item, A row in pivate memory
@@ -136,7 +138,7 @@
     run_time = time() - start_time
 
     cl.enqueue_copy(queue, h_C, d_c)
-    results(N, h_C, run_time)
+    results(N, h_C, C0, run_time)
 
 #--------------------------------------------------------------------------------
 # OpenCL matrix multiplication ... C row per work item, A row pivate, B col local
@@ -160,7 +162,7 @@
     run_time = time() - start_time
 
     cl.enqueue_copy(queue, h_C, d_c)
-    results(N, h_C, run_time)
+    results(N, h_C, C0, run_time)
 
 #--------------------------------------------------------------------------------
 # OpenCL matrix multiplication ... blocked
@@ -190,5 +192,4 @@
     run_time = time() - start_time
 
     cl.enqueue_copy(queue, h_C, d_c)
-    results(N, h_C, run_time)
-
+    results(N, h_C, C0, run_time)