From 07e1adc110dc6d6adb03b2f7118928df754a34b4 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Wed, 10 Jul 2013 21:37:18 +0200
Subject: [PATCH 01/12] Basic argparse-based CLI parsing

---
 run_benchmarks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/run_benchmarks.py b/run_benchmarks.py
index 4163fbe..d079a97 100644
--- a/run_benchmarks.py
+++ b/run_benchmarks.py
@@ -7,6 +7,7 @@
     from collections import OrderedDict
 except:
     from ordereddict import OrderedDict
+import argparse
 import json
 import os
 import traceback

From 321fec2d2589b5ef16c03bb5412f818eafde2614 Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Tue, 13 Aug 2013 10:07:04 -0400
Subject: [PATCH 02/12] init gemm benchmark dir

---
 gemm/__init__.py    | 14 ++++++++++++++
 gemm/gemm_python.py | 46 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 gemm/__init__.py
 create mode 100644 gemm/gemm_python.py

diff --git a/gemm/__init__.py b/gemm/__init__.py
new file mode 100644
index 0000000..000ffa7
--- /dev/null
+++ b/gemm/__init__.py
@@ -0,0 +1,14 @@
+# Authors: James Bergstra
+# License: MIT
+
+import numpy as np
+
+
+def make_env(shape=(256, 256, 256), seed=0, dtype=np.float):
+    rng = np.random.RandomState(seed)
+    A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype)
+    B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype)
+    C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
+    alpha = 1.5
+    beta = 0.3
+    return (alpha, A, B, beta, C), {}
diff --git a/gemm/gemm_python.py b/gemm/gemm_python.py
new file mode 100644
index 0000000..021e4ee
--- /dev/null
+++ b/gemm/gemm_python.py
@@ -0,0 +1,46 @@
+# Authors: James Bergstra
+# License: MIT
+
+import numpy as np
+
+
+def gemm_python_nested_for_loops(alpha, A, B, beta, C):
+
+    M, N = C.shape
+    K = A.shape[1]
+    #"omp parallel for private(j, d, k, tmp)"
+    for i in range(M):
+        for j in range(N):
+            d = 0.0
+            for k in range(K):
+                tmp = A[i, k] * B[j, k]
+                d += tmp
+            C[i, j] = alpha * d + beta * C[i, j]
+    return C
+
+
+def gemm_python_inner_numpy(alpha, A, B, beta, C):
+    M, N = C.shape
+    for i in xrange(M):
+        for j in xrange(N):
+            C[i, j] *= beta
+            C[i, j] += alpha * np.dot(A[i, :], B[:, j])
+    return C
+
+
+def gemm_python_broadcast_numpy(alpha, A, B, beta, C):
+    return alpha * (A[:, None, :] * B.T[None, :, :]).sum(axis=2) + beta * C
+
+
+def gemm_python_numpy_dot(alpha, A, B, beta, C):
+    C *= beta
+    C += alpha * np.dot(A, B)
+    return C
+
+
+benchmarks = (
+    gemm_python_nested_for_loops,
+    gemm_python_inner_numpy,
+    gemm_python_broadcast_numpy,
+    gemm_python_numpy_dot,
+)

From 182d8786e1f8f451ec78e76e5bd5c7ae10bb9abd Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Tue, 13 Aug 2013 18:06:07 -0400
Subject: [PATCH 03/12] init gemm pyopencl

---
 gemm/gemm_pyopencl.py | 295 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 gemm/gemm_pyopencl.py

diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py
new file mode 100644
index 0000000..53bf0ff
--- /dev/null
+++ b/gemm/gemm_pyopencl.py
@@ -0,0 +1,295 @@
+# Authors: James Bergstra
+# License: MIT
+
+import sys
+import time
+
+from mako.template import Template
+import numpy as np
+import pyopencl as cl
+
+mf = cl.mem_flags
+
+PROFILING = 1
+
+ctx = cl.create_some_context()
+if PROFILING:
+    queue = cl.CommandQueue(
+        ctx,
+        properties=cl.command_queue_properties.PROFILING_ENABLE)
+else:
+    queue = cl.CommandQueue(ctx)
+
+
+def ctype_from_dtype(dtype):
+    return {
+            'float32': 'float',
+            'float64': 'double',
+            }[str(dtype)]
+
+
+def elemstrides(strides, dtype, vec=1, vecdim=-1):
+    size = {
+        'float32': 4,
+        'float64': 8,
+    }[str(dtype)]
+    # TODO: raise StrideError
+    for stride in strides:
+        assert stride % size == 0
+    val_strides = tuple(int(s / size) for s in strides)
+    if vec == 1:
+        return val_strides
+    vecdim = range(len(strides))[vecdim]   # -- make vecdim non-neg
+    for ii, val_stride in enumerate(val_strides):
+        if ii == vecdim:
+            assert val_stride == 1
+        else:
+            assert val_stride % vec == 0
+    vec_strides = [int(s / vec) for s in val_strides]
+    vec_strides[vecdim] = 1
+    return tuple(vec_strides)
+
+
+
+def memoize(f):
+    cache = {}
+    def new_fn(*args, **kwargs):
+        key = args + tuple(sorted(kwargs.items()))
+        try:
+            return cache[key]
+        except KeyError:
+            rval = f(*args, **kwargs)
+            cache[key] = rval
+            return rval
+    new_fn.__name__ = f.__name__
+    new_fn.memoize_cache = cache
+    return new_fn
+
+
+@memoize
+def gemm_cpu_prepare_reference(alpha, beta, M, N, K, dtype,
+                               Astrides, Bstrides, Cstrides):
+    ctype = ctype_from_dtype(dtype)
+    (As0, As1) = elemstrides(Astrides, dtype)
+    (Bs0, Bs1) = elemstrides(Bstrides, dtype)
+    (Cs0, Cs1) = elemstrides(Cstrides, dtype)
+    prg = cl.Program(ctx, """
+        __kernel void ref(__global %(ctype)s *A,
+                          __global %(ctype)s *B,
+                          __global %(ctype)s *C)
+        {
+          for(int mm = get_global_id(0); mm < %(M)s; mm += get_global_size(0))
+          {
+            for(int nn = get_global_id(1); nn < %(N)s; nn += get_global_size(1))
+            {
+              %(ctype)s buf = 0;
+              for (int kk = 0; kk < %(K)s; ++kk)
+              {
+                  buf += A[mm * %(As0)s + kk * %(As1)s]
+                       * B[kk * %(Bs0)s + nn * %(Bs1)s];
+              }
+              C[mm * %(Cs0)s + nn * %(Cs1)s] *= %(beta)s;
+              C[mm * %(Cs0)s + nn * %(Cs1)s] += %(alpha)s * buf;
+            }
+          }
+        }
+        """ % locals()).build()
+
+    return prg.ref
+
+class StrideError(Exception):
+    """StrideError"""
+
+class BlockingError(Exception):
+    """BlockingError"""
+
+vectorized_text =  """
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+__kernel void kern(__global ${ctype}${KB} *A,
+                  __global ${ctype}${NB} *B,
+                  __global ${ctype}${NB} *C)
+{
+  const ${ctype}${NB} beta = (${ctype}${NB})(
+      ${beta}
+  % for ii in range(NB - 1):
+      , ${beta}
+  % endfor
+  );
+  ${ctype}${NB} tmp;
+  % for ii in range(MB):
+  ${ctype}${KB} Abuf${ii};
+  ${ctype}${NB} Cbuf${ii};
+  % endfor
+  ${ctype}${NB} Bbuf;
+  for(int mb = get_global_id(0); mb < ${NoMB}; mb += get_global_size(0))
+  {
+    for(int nb = get_global_id(1); nb < ${NoNB}; nb += get_global_size(1))
+    {
+      % for ii in range(MB):
+      Cbuf${ii} = (${ctype}${NB})(
+                    0
+                    % for foo in range(NB - 1):
+                    , 0
+                    % endfor
+                    );
+      % endfor
+
+      for (int kb = 0; kb < ${NoKB}; ++kb)
+      {
+        // load KB columns of A at a time
+        % for ii in range(MB):
+        Abuf${ii} = A[${As0} * (mb * ${MB} + ${ii}) + kb];
+        % endfor
+
+        % for kki in range(KB):
+        Bbuf = B[(kb * ${KB} + ${kki}) * ${Bs0} + nb];
+
+            % for ii in range(MB):
+
+            tmp = (${ctype}${NB})(
+                    Abuf${ii}.s${kki}
+                    % for foo in range(NB - 1):
+                    , Abuf${ii}.s${kki}
+                    % endfor
+                    );
+            Cbuf${ii} = mad(tmp, Bbuf, Cbuf${ii});
+            % endfor
+
+        % endfor
+      }
+
+      % if alpha != 1:
+          % for ii in range(MB):
+              Cbuf${ii} *= (${ctype}${NB})(
+                    ${alpha}
+                    % for foo in range(NB - 1):
+                    , ${alpha}
+                    % endfor
+                    );
+          % endfor
+      % endif
+
+      % for ii in range(MB):
+          % if beta == 0:
+              C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = Cbuf${ii};
+          % elif beta == 1:
+              C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] += Cbuf${ii};
+          % else:
+              C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = mad(beta, C[(mb* ${MB} + ${ii}) * ${Cs0} + nb], Cbuf${ii});
+          % endif
+      % endfor
+    }
+  }
+}
+    """
+
+
+@memoize
+def gemm_cpu_prepare_vectorized(alpha, beta, M, N, K, dtype,
+                               Astrides, Bstrides, Cstrides, MB, NB, KB):
+    ctype = ctype_from_dtype(dtype)
+    (As0, As1) = elemstrides(Astrides, dtype, MB)
+    (Bs0, Bs1) = elemstrides(Bstrides, dtype, NB)
+    (Cs0, Cs1) = elemstrides(Cstrides, dtype, MB)
+    if MB: assert As1 == 1
+    if NB: assert Bs1 == 1
+    if MB: assert Cs1 == 1
+    NoMB = M // MB
+    NoNB = N // NB
+    NoKB = K // KB
+    if M != MB * NoMB:
+        raise BlockingError()
+    if N != NB * NoNB:
+        raise BlockingError()
+    if K != KB * NoKB:
+        raise BlockingError()
+    text = Template(vectorized_text, output_encoding='ascii').render(**locals())
+    for ii, line in enumerate(text.split('\n')):
+        print ii, line
+    prg = cl.Program(ctx, text).build()
+    print 'built!'
+
+    return prg.kern
+
+
+comptimes = []
+def gemm_pyopencl_cpu(alpha, A, B, beta, C):
+    kern = None
+    global_shape = (4, 4)   # enough for different cores
+    local_shape = (1, 1)    # I think this does nothing on CPU (true?)
+
+    # TODO: some values of these constants that should work, do not.
+    # e.g. 16, 8, 8 -> wrong answer
+    #      4, 4, 8  -> segfault
+    for MB in [8]:
+        for NB in [8]:
+            for KB in [8]:
+                if kern:
+                    continue
+                try:
+                    kern = gemm_cpu_prepare_vectorized(
+                        alpha, beta,
+                        C.shape[0], C.shape[1], A.shape[1],
+                        A.dtype,
+                        A.strides, B.strides, C.strides,
+                        MB=MB, NB=NB, KB=KB)
+                    print 'Using kernel for MB=%i NB=%i KB=%i' % (MB, NB, KB)
+                except StrideError:
+                    pass
+    if kern is None:
+        kern = gemm_cpu_prepare_reference(alpha, beta,
+                                          C.shape[0], C.shape[1], A.shape[1],
+                                          A.dtype,
+                                          A.strides, B.strides, C.strides)
+
+    A_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=A)
+    B_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=B)
+    C_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=C)
+    ev = kern(queue, global_shape, local_shape, A_buf, B_buf, C_buf)
+    cl.enqueue_copy(queue, C, C_buf)
+    queue.finish()
+    if PROFILING:
+        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
+        print 'computation time', min(comptimes)
+
+
+benchmarks = (
+    gemm_pyopencl_cpu,
+)
+
+NNN = 1024 #* 2
+def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float64):
+    rng = np.random.RandomState(seed)
+    A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype)
+    B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype)
+    C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
+    C2 = np.empty_like(C)
+    Z = np.ones(10 * 1024 * 1024)
+    alpha = 1.0
+    beta = 0.0
+    FLOPS = shape[0] * shape[1] * (shape[2] + 1) * 2
+    for i in range(500):
+        rng.seed(i)
+        C[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
+
+        t0 = time.time()
+        gemm_pyopencl_cpu(alpha, A, B, beta, C)
+        t1 = time.time()
+        print 'pyopencl time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3)
+
+        rng.seed(i)
+        C2[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
+
+        t0 = time.time()
+        C3 = alpha * np.dot( A, B) + beta * C2
+        t1 = time.time()
+        print 'np.dot time:   ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3)
+
+        print np.max(abs(C - C3))
+        assert np.allclose(C, C3)
+        # -- clear processor cache
+        Z.sum()
+        time.sleep(1)
+
+if __name__ == '__main__':
+    sys.exit(main())

From 16805c6b7a077606ce9033ee757adb303afbaccb Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Wed, 14 Aug 2013 09:49:37 -0400
Subject: [PATCH 04/12] gemm_pyopencl -> BSD-3

---
 gemm/gemm_pyopencl.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py
index 53bf0ff..d0ce9fd 100644
--- a/gemm/gemm_pyopencl.py
+++ b/gemm/gemm_pyopencl.py
@@ -1,5 +1,5 @@
 # Authors: James Bergstra
-# License: MIT
+# License: BSD-3
 
 import sys
 import time
@@ -10,7 +10,7 @@
 
 mf = cl.mem_flags
 
-PROFILING = 1
+PROFILING = 0
 
 ctx = cl.create_some_context()
 if PROFILING:
@@ -257,8 +257,8 @@ def gemm_pyopencl_cpu(alpha, A, B, beta, C):
     gemm_pyopencl_cpu,
 )
 
-NNN = 1024 #* 2
-def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float64):
+NNN = 1024 * 2
+def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float32):
     rng = np.random.RandomState(seed)
     A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype)
     B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype)
@@ -276,6 +276,7 @@ def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float64):
         gemm_pyopencl_cpu(alpha, A, B, beta, C)
         t1 = time.time()
         print 'pyopencl time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3)
+        continue
 
         rng.seed(i)
         C2[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)

From f01dbb16f02d637dad45a0296583fe98366cf5ec Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Wed, 14 Aug 2013 11:46:08 -0400
Subject: [PATCH 05/12] several gemm implementations

---
 gemm/__init__.py      |  13 +-
 gemm/gemm_cython.pyx  |  32 +++++
 gemm/gemm_numba.py    |  11 ++
 gemm/gemm_parakeet.py |  13 ++
 gemm/gemm_pyopencl.py | 295 +-----------------------------------------
 gemm/gemm_python.py   |   3 +-
 gemm/gemm_theano.py   |  45 +++++++
 7 files changed, 113 insertions(+), 299 deletions(-)
 create mode 100644 gemm/gemm_cython.pyx
 create mode 100644 gemm/gemm_numba.py
 create mode 100644 gemm/gemm_parakeet.py
 create mode 100644 gemm/gemm_theano.py

diff --git a/gemm/__init__.py b/gemm/__init__.py
index 000ffa7..7ee8d1a 100644
--- a/gemm/__init__.py
+++ b/gemm/__init__.py
@@ -4,11 +4,12 @@
 import numpy as np
 
 
-def make_env(shape=(256, 256, 256), seed=0, dtype=np.float):
+def make_env(M=512, N=512, K=512, seed=0, dtype=np.float64,
+        alpha=1.5,
+        beta=0.5):
     rng = np.random.RandomState(seed)
-    A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype)
-    B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype)
-    C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
-    alpha = 1.5
-    beta = 0.3
+    A = np.asarray(rng.normal(size=(M, K)), dtype=dtype)
+    B = np.asarray(rng.normal(size=(K, N)), dtype=dtype)
+    C = np.asarray(rng.normal(size=(M, N)), dtype=dtype)
     return (alpha, A, B, beta, C), {}
+
diff --git a/gemm/gemm_cython.pyx b/gemm/gemm_cython.pyx
new file mode 100644
index 0000000..2f597be
--- /dev/null
+++ b/gemm/gemm_cython.pyx
@@ -0,0 +1,32 @@
+
+# Authors: Jake Vanderplas, Olivier Grisel
+# License: MIT
+
+import numpy as np
+cimport cython
+from libc.math cimport sqrt
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def gemm_cython_for_loops(
+        double alpha,
+        double[:, ::1] A,
+        double[:, ::1] B,
+        double beta,
+        double[:, ::1] C,
+        ):
+    cdef int M = C.shape[0]
+    cdef int N = C.shape[1]
+    cdef int K = A.shape[1]
+    cdef double tmp, d
+    for i in range(M):
+        for j in range(N):
+            d = 0.0
+            for k in range(K):
+                d += A[i, k] * B[k, j]
+            C[i, j] = alpha * d + beta * C[i, j]
+
+
+benchmarks = (
+    gemm_cython_for_loops,
+)
diff --git a/gemm/gemm_numba.py b/gemm/gemm_numba.py
new file mode 100644
index 0000000..702ca38
--- /dev/null
+++ b/gemm/gemm_numba.py
@@ -0,0 +1,11 @@
+# Authors: Olivier Grisel
+# License: MIT
+
+from gemm import gemm_python
+from numba import autojit
+
+
+benchmarks = (
+    ("gemm_numba_nested_for_loops",
+     autojit(gemm_python.gemm_python_nested_for_loops)),
+)
diff --git a/gemm/gemm_parakeet.py b/gemm/gemm_parakeet.py
new file mode 100644
index 0000000..8c189d1
--- /dev/null
+++ b/gemm/gemm_parakeet.py
@@ -0,0 +1,13 @@
+# Authors: Olivier Grisel
+# License: MIT
+
+from gemm import gemm_python
+from parakeet import jit
+
+
+benchmarks = (
+    ("gemm_parakeet_nested_for_loops",
+     jit(gemm_python.gemm_python_nested_for_loops)),
+    ("gemm_parakeet_inner_numpy",
+     jit(gemm_python.gemm_python_inner_numpy)),
+)
diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py
index d0ce9fd..a1a9eaf 100644
--- a/gemm/gemm_pyopencl.py
+++ b/gemm/gemm_pyopencl.py
@@ -1,296 +1,9 @@
-# Authors: James Bergstra
-# License: BSD-3
-
-import sys
-import time
-
-from mako.template import Template
-import numpy as np
-import pyopencl as cl
-
-mf = cl.mem_flags
-
-PROFILING = 0
-
-ctx = cl.create_some_context()
-if PROFILING:
-    queue = cl.CommandQueue(
-        ctx,
-        properties=cl.command_queue_properties.PROFILING_ENABLE)
-else:
-    queue = cl.CommandQueue(ctx)
-
-
-def ctype_from_dtype(dtype):
-    return {
-            'float32': 'float',
-            'float64': 'double',
-            }[str(dtype)]
-
-
-def elemstrides(strides, dtype, vec=1, vecdim=-1):
-    size = {
-        'float32': 4,
-        'float64': 8,
-    }[str(dtype)]
-    # TODO: raise StrideError
-    for stride in strides:
-        assert stride % size == 0
-    val_strides = tuple(int(s / size) for s in strides)
-    if vec == 1:
-        return val_strides
-    vecdim = range(len(strides))[vecdim]   # -- make vecdim non-neg
-    for ii, val_stride in enumerate(val_strides):
-        if ii == vecdim:
-            assert val_stride == 1
-        else:
-            assert val_stride % vec == 0
-    vec_strides = [int(s / vec) for s in val_strides]
-    vec_strides[vecdim] = 1
-    return tuple(vec_strides)
-
-
-
-def memoize(f):
-    cache = {}
-    def new_fn(*args, **kwargs):
-        key = args + tuple(sorted(kwargs.items()))
-        try:
-            return cache[key]
-        except KeyError:
-            rval = f(*args, **kwargs)
-            cache[key] = rval
-            return rval
-    new_fn.__name__ = f.__name__
-    new_fn.memoize_cache = cache
-    return new_fn
-
-
-@memoize
-def gemm_cpu_prepare_reference(alpha, beta, M, N, K, dtype,
-                               Astrides, Bstrides, Cstrides):
-    ctype = ctype_from_dtype(dtype)
-    (As0, As1) = elemstrides(Astrides, dtype)
-    (Bs0, Bs1) = elemstrides(Bstrides, dtype)
-    (Cs0, Cs1) = elemstrides(Cstrides, dtype)
-    prg = cl.Program(ctx, """
-        __kernel void ref(__global %(ctype)s *A,
-                          __global %(ctype)s *B,
-                          __global %(ctype)s *C)
-        {
-          for(int mm = get_global_id(0); mm < %(M)s; mm += get_global_size(0))
-          {
-            for(int nn = get_global_id(1); nn < %(N)s; nn += get_global_size(1))
-            {
-              %(ctype)s buf = 0;
-              for (int kk = 0; kk < %(K)s; ++kk)
-              {
-                  buf += A[mm * %(As0)s + kk * %(As1)s]
-                       * B[kk * %(Bs0)s + nn * %(Bs1)s];
-              }
-              C[mm * %(Cs0)s + nn * %(Cs1)s] *= %(beta)s;
-              C[mm * %(Cs0)s + nn * %(Cs1)s] += %(alpha)s * buf;
-            }
-          }
-        }
-        """ % locals()).build()
-
-    return prg.ref
-
-class StrideError(Exception):
-    """StrideError"""
-
-class BlockingError(Exception):
-    """BlockingError"""
-
-vectorized_text =  """
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-__kernel void kern(__global ${ctype}${KB} *A,
-                  __global ${ctype}${NB} *B,
-                  __global ${ctype}${NB} *C)
-{
-  const ${ctype}${NB} beta = (${ctype}${NB})(
-      ${beta}
-  % for ii in range(NB - 1):
-      , ${beta}
-  % endfor
-  );
-  ${ctype}${NB} tmp;
-  % for ii in range(MB):
-  ${ctype}${KB} Abuf${ii};
-  ${ctype}${NB} Cbuf${ii};
-  % endfor
-  ${ctype}${NB} Bbuf;
-  for(int mb = get_global_id(0); mb < ${NoMB}; mb += get_global_size(0))
-  {
-    for(int nb = get_global_id(1); nb < ${NoNB}; nb += get_global_size(1))
-    {
-      % for ii in range(MB):
-      Cbuf${ii} = (${ctype}${NB})(
-                    0
-                    % for foo in range(NB - 1):
-                    , 0
-                    % endfor
-                    );
-      % endfor
-
-      for (int kb = 0; kb < ${NoKB}; ++kb)
-      {
-        // load KB columns of A at a time
-        % for ii in range(MB):
-        Abuf${ii} = A[${As0} * (mb * ${MB} + ${ii}) + kb];
-        % endfor
-
-        % for kki in range(KB):
-        Bbuf = B[(kb * ${KB} + ${kki}) * ${Bs0} + nb];
-
-            % for ii in range(MB):
-
-            tmp = (${ctype}${NB})(
-                    Abuf${ii}.s${kki}
-                    % for foo in range(NB - 1):
-                    , Abuf${ii}.s${kki}
-                    % endfor
-                    );
-            Cbuf${ii} = mad(tmp, Bbuf, Cbuf${ii});
-            % endfor
-
-        % endfor
-      }
-
-      % if alpha != 1:
-          % for ii in range(MB):
-              Cbuf${ii} *= (${ctype}${NB})(
-                    ${alpha}
-                    % for foo in range(NB - 1):
-                    , ${alpha}
-                    % endfor
-                    );
-          % endfor
-      % endif
-
-      % for ii in range(MB):
-          % if beta == 0:
-              C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = Cbuf${ii};
-          % elif beta == 1:
-              C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] += Cbuf${ii};
-          % else:
-              C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = mad(beta, C[(mb* ${MB} + ${ii}) * ${Cs0} + nb], Cbuf${ii});
-          % endif
-      % endfor
-    }
-  }
-}
-    """
-
-
-@memoize
-def gemm_cpu_prepare_vectorized(alpha, beta, M, N, K, dtype,
-                               Astrides, Bstrides, Cstrides, MB, NB, KB):
-    ctype = ctype_from_dtype(dtype)
-    (As0, As1) = elemstrides(Astrides, dtype, MB)
-    (Bs0, Bs1) = elemstrides(Bstrides, dtype, NB)
-    (Cs0, Cs1) = elemstrides(Cstrides, dtype, MB)
-    if MB: assert As1 == 1
-    if NB: assert Bs1 == 1
-    if MB: assert Cs1 == 1
-    NoMB = M // MB
-    NoNB = N // NB
-    NoKB = K // KB
-    if M != MB * NoMB:
-        raise BlockingError()
-    if N != NB * NoNB:
-        raise BlockingError()
-    if K != KB * NoKB:
-        raise BlockingError()
-    text = Template(vectorized_text, output_encoding='ascii').render(**locals())
-    for ii, line in enumerate(text.split('\n')):
-        print ii, line
-    prg = cl.Program(ctx, text).build()
-    print 'built!'
-
-    return prg.kern
-
-
-comptimes = []
-def gemm_pyopencl_cpu(alpha, A, B, beta, C):
-    kern = None
-    global_shape = (4, 4)   # enough for different cores
-    local_shape = (1, 1)    # I think this does nothing on CPU (true?)
-
-    # TODO: some values of these constants that should work, do not.
-    # e.g. 16, 8, 8 -> wrong answer
-    #      4, 4, 8  -> segfault
-    for MB in [8]:
-        for NB in [8]:
-            for KB in [8]:
-                if kern:
-                    continue
-                try:
-                    kern = gemm_cpu_prepare_vectorized(
-                        alpha, beta,
-                        C.shape[0], C.shape[1], A.shape[1],
-                        A.dtype,
-                        A.strides, B.strides, C.strides,
-                        MB=MB, NB=NB, KB=KB)
-                    print 'Using kernel for MB=%i NB=%i KB=%i' % (MB, NB, KB)
-                except StrideError:
-                    pass
-    if kern is None:
-        kern = gemm_cpu_prepare_reference(alpha, beta,
-                                          C.shape[0], C.shape[1], A.shape[1],
-                                          A.dtype,
-                                          A.strides, B.strides, C.strides)
-
-    A_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=A)
-    B_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=B)
-    C_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=C)
-    ev = kern(queue, global_shape, local_shape, A_buf, B_buf, C_buf)
-    cl.enqueue_copy(queue, C, C_buf)
-    queue.finish()
-    if PROFILING:
-        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
-        print 'computation time', min(comptimes)
+# Author: James Bergstra
+# License: MIT
 
+from pybench_pyopencl import gemm_pyopencl
 
 benchmarks = (
-    gemm_pyopencl_cpu,
+    gemm_pyopencl.gemm_pyopencl_cpu,
 )
 
-NNN = 1024 * 2
-def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float32):
-    rng = np.random.RandomState(seed)
-    A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype)
-    B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype)
-    C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
-    C2 = np.empty_like(C)
-    Z = np.ones(10 * 1024 * 1024)
-    alpha = 1.0
-    beta = 0.0
-    FLOPS = shape[0] * shape[1] * (shape[2] + 1) * 2
-    for i in range(500):
-        rng.seed(i)
-        C[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
-
-        t0 = time.time()
-        gemm_pyopencl_cpu(alpha, A, B, beta, C)
-        t1 = time.time()
-        print 'pyopencl time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3)
-        continue
-
-        rng.seed(i)
-        C2[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype)
-
-        t0 = time.time()
-        C3 = alpha * np.dot( A, B) + beta * C2
-        t1 = time.time()
-        print 'np.dot time:   ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3)
-
-        print np.max(abs(C - C3))
-        assert np.allclose(C, C3)
-        # -- clear processor cache
-        Z.sum()
-        time.sleep(1)
-
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/gemm/gemm_python.py b/gemm/gemm_python.py
index 021e4ee..9a7ce52 100644
--- a/gemm/gemm_python.py
+++ b/gemm/gemm_python.py
@@ -4,8 +4,8 @@
 import numpy as np
 
 
+# -- too slow to run directly, but good for JIT by other systems
 def gemm_python_nested_for_loops(alpha, A, B, beta, C):
-
     M, N = C.shape
     K = A.shape[1]
     #"omp parallel for private(j, d, k, tmp)"
@@ -39,7 +39,6 @@ def gemm_python_numpy_dot(alpha, A, B, beta, C):
 
 
 benchmarks = (
-    gemm_python_nested_for_loops,
     gemm_python_inner_numpy,
     gemm_python_broadcast_numpy,
     gemm_python_numpy_dot,
diff --git a/gemm/gemm_theano.py b/gemm/gemm_theano.py
new file mode 100644
index 0000000..e6948bd
--- /dev/null
+++ b/gemm/gemm_theano.py
@@ -0,0 +1,45 @@
+# Authors: James Bergstra
+# License: MIT
+import theano
+import theano.tensor as TT
+
+
+def gemm_theano_tensor_prepare(dtype):
+    alpha = TT.scalar(dtype=str(dtype))
+    beta = TT.scalar(dtype=str(dtype))
+    A = TT.matrix(dtype=str(dtype))
+    B = TT.matrix(dtype=str(dtype))
+    C = TT.matrix(dtype=str(dtype))
+    Z = alpha * TT.sum(A[:, None, :] * B, axis=2) + beta * C
+    name = 'gemm_theano_broadcast_' + dtype
+    Cin = theano.In(C, mutable=True, borrow=True)
+    rval = theano.function([alpha, A, B, beta, Cin],
+                           theano.Out(Z, borrow=True),
+                           allow_input_downcast=True, name=name)
+    rval.__name__ = name
+    return rval
+
+
+def gemm_theano_blas_prepare(dtype):
+    alpha = TT.scalar(dtype=str(dtype))
+    beta = TT.scalar(dtype=str(dtype))
+    A = TT.matrix(dtype=str(dtype))
+    B = TT.matrix(dtype=str(dtype))
+    C = TT.matrix(dtype=str(dtype))
+    Z = alpha * TT.dot(A, B) + beta * C
+    Cin = theano.In(C, mutable=True, borrow=True)
+    name = 'gemm_theano_blas_' + dtype
+    rval = theano.function([alpha, A, B, beta, Cin],
+                           theano.Out(Z, borrow=True),
+                           allow_input_downcast=True, name=name)
+    rval.__name__ = name
+    return rval
+
+
+benchmarks = (
+    #gemm_theano_tensor_prepare('float32'),
+    gemm_theano_tensor_prepare('float64'),
+    #gemm_theano_blas_prepare('float32'),
+    gemm_theano_blas_prepare('float64'),
+)
+

From de77f07c061ce4947ea51d41b671a54a363619c4 Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Wed, 14 Aug 2013 11:51:18 -0400
Subject: [PATCH 06/12] implementing pairwise_pyopencl like gemm_pyopencl

---
 pairwise/pairwise_pyopencl.py | 104 +++-------------------------------
 1 file changed, 7 insertions(+), 97 deletions(-)

diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py
index 6e4e40d..c6a6669 100644
--- a/pairwise/pairwise_pyopencl.py
+++ b/pairwise/pairwise_pyopencl.py
@@ -1,105 +1,15 @@
-# Authors: James Bergstra
+# Author: James Bergstra
 # License: MIT
 
 import numpy as np
-import time
-import pyopencl as cl
-import numpy
+from pybench_pyopencl import pairwise_pyopencl
 
-mf = cl.mem_flags
-
-PROFILING = 0
-
-ctx = cl.create_some_context()
-if PROFILING:
-    queue = cl.CommandQueue(
-        ctx,
-        properties=cl.command_queue_properties.PROFILING_ENABLE)
-else:
-    queue = cl.CommandQueue(ctx)
-
-_cache = {}
-
-def pairwise_pyopencl_cpu_prepare(shp, dtype):
-    N, D = shp
-    ctype = {
-            'float32': 'float',
-            'float64': 'double',
-            }[str(dtype)]
-
-    odd_d = "" if 0 == D % 2 else """
-    __global %(ctype)s * a1 = (__global %(ctype)s*) (a);
-    %(ctype)s diff = a1[(n0 + 1) * %(D)s - 1] - a1[(m0 + 1) * %(D)s - 1];
-    buf.s0 += diff * diff;
-    """
-
-    prg = cl.Program(ctx, """
-        __kernel void lower(__global %(ctype)s2 *a, __global %(ctype)s *c)
-        {
-          for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
-          {
-              for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
-              {
-                if (n0 < m0) continue;
-                  __global %(ctype)s2 *an = a + n0 * %(D)s / 2;
-                  __global %(ctype)s2 *am = a + m0 * %(D)s / 2;
-                  %(ctype)s2 buf = 0;
-                  for (int d = 0; d < %(D)s/2; ++d)
-                  {
-                    %(ctype)s2 diff = am[d] - an[d];
-                    buf += diff * diff;
-                  }
-                  %(odd_d)s;
-                  c[m0 * %(N)s + n0] = sqrt(buf.s0 + buf.s1);
-              }
-          }
-        }
-        __kernel void upper(__global %(ctype)s *a, __global %(ctype)s *c)
-        {
-          for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
-          {
-              for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
-              {
-                if (n0 >= m0) continue;
-                  c[m0 * %(N)s + n0] = c[n0 * %(N)s + m0];
-              }
-          }
-        }
-        """ % locals()).build()
-
-    return prg.lower, prg.upper
-
-
-comptimes = []
 def pairwise_pyopencl_cpu(data):
-    data = np.asarray(data, order='C')
-    N, D = data.shape
-    try:
-        lower, upper = _cache[(data.shape, data.dtype)]
-    except:
-        lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
-        _cache[(data.shape, data.dtype)] = lower, upper
-    data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
-    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)
-    try:
-        rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ,
-                offset=0, shape=(N, N), dtype=data.dtype)
-        need_copy = False
-    except TypeError: #OSX's OCL needs this?
-        rval = np.empty((N, N), dtype=data.dtype)
-        need_copy = True
-    lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
-    upper(queue, (4, 4), (1, 1), data_buf, dest_buf)
-    if need_copy:
-        cl.enqueue_copy(queue, rval, dest_buf)
-    else:
-        queue.finish()
-    if PROFILING:
-        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
-        print 'computation time', min(comptimes)
-    return rval
-
+    M, K = data.shape
+    out = np.zeros((M, M), dtype=data.dtype)
+    return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out)
 
 benchmarks = (
-    pairwise_pyopencl_cpu,
+        pairwise_pyopencl_cpu,
 )
+

From 1105d1515327b3a84a18e7c1e6468d3902c18b00 Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Wed, 14 Aug 2013 13:41:02 -0400
Subject: [PATCH 07/12] comment with location of pybench_pyopencl

---
 gemm/gemm_pyopencl.py         | 1 +
 pairwise/pairwise_pyopencl.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py
index a1a9eaf..235dbe9 100644
--- a/gemm/gemm_pyopencl.py
+++ b/gemm/gemm_pyopencl.py
@@ -1,6 +1,7 @@
 # Author: James Bergstra
 # License: MIT
 
+# -- https://github.com/jaberg/python-benchmarks-pyopencl
 from pybench_pyopencl import gemm_pyopencl
 
 benchmarks = (
diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py
index c6a6669..6cbce99 100644
--- a/pairwise/pairwise_pyopencl.py
+++ b/pairwise/pairwise_pyopencl.py
@@ -2,6 +2,8 @@
 # License: MIT
 
 import numpy as np
+
+# -- https://github.com/jaberg/python-benchmarks-pyopencl
 from pybench_pyopencl import pairwise_pyopencl
 
 def pairwise_pyopencl_cpu(data):

From b89f4841afc0ea40d6ab01a238e24ed085654ade Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Wed, 14 Aug 2013 13:44:15 -0400
Subject: [PATCH 08/12] pairwise: removed "cheating" gemm-based impls

---
 pairwise/pairwise_python.py |  7 -------
 pairwise/pairwise_theano.py | 16 +---------------
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/pairwise/pairwise_python.py b/pairwise/pairwise_python.py
index 21698fe..c792783 100644
--- a/pairwise/pairwise_python.py
+++ b/pairwise/pairwise_python.py
@@ -31,15 +31,8 @@ def pairwise_python_broadcast_numpy(data):
     return np.sqrt(((data[:, None, :] - data) ** 2).sum(axis=2))
 
 
-def pairwise_python_numpy_dot(data):
-    X_norm_2 = (data ** 2).sum(axis=1)
-    dists = np.sqrt(2 * X_norm_2 - np.dot(data, data.T))
-    return dists
-
-
 benchmarks = (
     pairwise_python_nested_for_loops,
     pairwise_python_inner_numpy,
     pairwise_python_broadcast_numpy,
-    pairwise_python_numpy_dot,
 )
diff --git a/pairwise/pairwise_theano.py b/pairwise/pairwise_theano.py
index 367f545..a13661f 100644
--- a/pairwise/pairwise_theano.py
+++ b/pairwise/pairwise_theano.py
@@ -18,21 +18,7 @@ def pairwise_theano_tensor_prepare(dtype):
     return rval
 
 
-def pairwise_theano_blas_prepare(dtype):
-    X = TT.matrix(dtype=str(dtype))
-    X_norm_2 = (X ** 2).sum(axis=1)
-    dists = TT.sqrt(2 * X_norm_2 - TT.dot(X, X.T))
-    name = 'pairwise_theano_blas_' + dtype
-    rval = theano.function([X],
-                           theano.Out(dists, borrow=True),
-                           allow_input_downcast=True, name=name)
-    rval.__name__ = name
-    return rval
-
-
 benchmarks = (
-    pairwise_theano_tensor_prepare('float32'),
+    #pairwise_theano_tensor_prepare('float32'),
     pairwise_theano_tensor_prepare('float64'),
-    pairwise_theano_blas_prepare('float32'),
-    pairwise_theano_blas_prepare('float64'),
 )

From 8f3b24d08d080e4c5931f60da22a008e81bd8833 Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Wed, 14 Aug 2013 19:05:23 -0400
Subject: [PATCH 09/12] pairwise_pyopencl init output empty instead of 0

---
 pairwise/pairwise_pyopencl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py
index 6cbce99..983195c 100644
--- a/pairwise/pairwise_pyopencl.py
+++ b/pairwise/pairwise_pyopencl.py
@@ -8,7 +8,7 @@
 
 def pairwise_pyopencl_cpu(data):
     M, K = data.shape
-    out = np.zeros((M, M), dtype=data.dtype)
+    out = np.empty((M, M), dtype=data.dtype, order='C')
     return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out)
 
 benchmarks = (

From 2f23d945ce6060a2169a0fb18b95c6202a2ed889 Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Thu, 15 Aug 2013 10:11:32 -0400
Subject: [PATCH 10/12] Wrote gemm benchmark docstring

---
 gemm/__init__.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/gemm/__init__.py b/gemm/__init__.py
index 7ee8d1a..de0b9a7 100644
--- a/gemm/__init__.py
+++ b/gemm/__init__.py
@@ -1,6 +1,35 @@
 # Authors: James Bergstra
 # License: MIT
 
+"""Computes the GEMM routine from the BLAS standard.
+
+BLAS defines the xGEMM (DGEMM, SGEMM, CGEMM, ZGEMM) operations as
+dense matrix multiplication and accumulation as follows:
+
+C <- alpha A x B + beta C
+
+Here (alpha, beta) are scalars and (A, B, C) are matrices.
+
+This operation is one of the most widely-used and most-studied kernels in
+high-performance computing, and BLAS implementations such as OpenBLAS, ATLAS,
+and MKL provide highly optimized implementations of this operation. GEMM
+implementations provide a real-life measure of peak performance on a
+particular platform.
+
+Note that the GEMM interface does not actually describe an algorithm, and the
+standard does not require particular numerical accuracy.  There are sub-cubic
+algorithms (e.g. Strassen), and there are also cubic algorithms that are
+"blocked" to be more cache-friendly.  I believe that OpenBLAS and ATLAS use
+blocked cubic algorithms, based on the asymptotic GFLOP/s attributed to MKL,
+I would guess it uses blocked cubic algorithms too.
+
+My hope with this benchmark is that it can be used to develop fast, readable
+GEMM implementations. I'm curious, for example, if a readable, blocked
+algorithm in pure Python could be compiled to a reasonable-performing
+implementation.
+
+"""
+
 import numpy as np
 
 

From 3b7370db5d36e615cd55f04b3c8be4e2d02606c6 Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Thu, 15 Aug 2013 10:11:44 -0400
Subject: [PATCH 11/12] Wrote pairwise benchmark docstring

---
 pairwise/__init__.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pairwise/__init__.py b/pairwise/__init__.py
index ce70b6f..547724b 100644
--- a/pairwise/__init__.py
+++ b/pairwise/__init__.py
@@ -1,6 +1,16 @@
 # Authors: Olivier Grisel
 # License: MIT
 
+"""Computes the Euclidean distance between each pair of rows in a matrix.
+
+In LaTeX:
+    Y[i, j] = sqrt{ \sum_k (A[i, k] - B[j, k])^2 }
+
+This computation is a core routine of many machine learning algorithms that
+rely on neighbourhood computations.
+
+"""
+
 import numpy as np
 
 

From 79d41430999764625e849f891976e55134e0c61f Mon Sep 17 00:00:00 2001
From: James Bergstra <james.bergstra@gmail.com>
Date: Thu, 15 Aug 2013 10:14:15 -0400
Subject: [PATCH 12/12] Commented on disabling of pairwise_theano float32

---
 pairwise/pairwise_theano.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pairwise/pairwise_theano.py b/pairwise/pairwise_theano.py
index a13661f..431ef0d 100644
--- a/pairwise/pairwise_theano.py
+++ b/pairwise/pairwise_theano.py
@@ -19,6 +19,9 @@ def pairwise_theano_tensor_prepare(dtype):
 
 
 benchmarks = (
-    #pairwise_theano_tensor_prepare('float32'),
+    # -- disabling float32 to match the precision of the other
+    #    implementations (assuming that the benchmark problem is
+    #    to carry out computations in double precision).
+    # pairwise_theano_tensor_prepare('float32'),
     pairwise_theano_tensor_prepare('float64'),
 )