From 07e1adc110dc6d6adb03b2f7118928df754a34b4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 10 Jul 2013 21:37:18 +0200 Subject: [PATCH 01/12] Basic argparse-based CLI parsing --- run_benchmarks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run_benchmarks.py b/run_benchmarks.py index 4163fbe..d079a97 100644 --- a/run_benchmarks.py +++ b/run_benchmarks.py @@ -7,6 +7,7 @@ from collections import OrderedDict except: from ordereddict import OrderedDict +import argparse import json import os import traceback From 321fec2d2589b5ef16c03bb5412f818eafde2614 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 13 Aug 2013 10:07:04 -0400 Subject: [PATCH 02/12] init gemm benchmark dir --- gemm/__init__.py | 14 ++++++++++++++ gemm/gemm_python.py | 46 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 gemm/__init__.py create mode 100644 gemm/gemm_python.py diff --git a/gemm/__init__.py b/gemm/__init__.py new file mode 100644 index 0000000..000ffa7 --- /dev/null +++ b/gemm/__init__.py @@ -0,0 +1,14 @@ +# Authors: James Bergstra +# License: MIT + +import numpy as np + + +def make_env(shape=(256, 256, 256), seed=0, dtype=np.float): + rng = np.random.RandomState(seed) + A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype) + B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype) + C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) + alpha = 1.5 + beta = 0.3 + return (alpha, A, B, beta, C), {} diff --git a/gemm/gemm_python.py b/gemm/gemm_python.py new file mode 100644 index 0000000..021e4ee --- /dev/null +++ b/gemm/gemm_python.py @@ -0,0 +1,46 @@ +# Authors: James Bergstra +# License: MIT + +import numpy as np + + +def gemm_python_nested_for_loops(alpha, A, B, beta, C): + + M, N = C.shape + K = A.shape[1] + #"omp parallel for private(j, d, k, tmp)" + for i in range(M): + for j in range(N): + d = 0.0 + for k in range(K): + tmp = A[i, k] * B[j, k] + d += tmp + C[i, j] = alpha * d + beta * C[i, j] + return C + + +def gemm_python_inner_numpy(alpha, A, B, beta, C): + M, N = C.shape + for i in xrange(M): + for j in xrange(N): + C[i, j] *= beta + C[i, j] += alpha * np.dot(A[i, :], B[:, j]) + return C + + +def gemm_python_broadcast_numpy(alpha, A, B, beta, C): + return alpha * (A[:, None, :] * B.T[None, :, :]).sum(axis=2) + beta * C + + +def gemm_python_numpy_dot(alpha, A, B, beta, C): + C *= beta + C += alpha * np.dot(A, B) + return C + + +benchmarks = ( + gemm_python_nested_for_loops, + gemm_python_inner_numpy, + gemm_python_broadcast_numpy, + gemm_python_numpy_dot, +) From 182d8786e1f8f451ec78e76e5bd5c7ae10bb9abd Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Tue, 13 Aug 2013 18:06:07 -0400 Subject: [PATCH 03/12] init gemm pyopencl --- gemm/gemm_pyopencl.py | 295 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 gemm/gemm_pyopencl.py diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py new file mode 100644 index 0000000..53bf0ff --- /dev/null +++ b/gemm/gemm_pyopencl.py @@ -0,0 +1,295 @@ +# Authors: James Bergstra +# License: MIT + +import sys +import time + +from mako.template import Template +import numpy as np +import pyopencl as cl + +mf = cl.mem_flags + +PROFILING = 1 + +ctx = cl.create_some_context() +if PROFILING: + queue = cl.CommandQueue( + ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) +else: + queue = cl.CommandQueue(ctx) + + +def ctype_from_dtype(dtype): + return { + 'float32': 'float', + 'float64': 'double', + }[str(dtype)] + + +def elemstrides(strides, dtype, vec=1, vecdim=-1): + size = { + 'float32': 4, + 'float64': 8, + }[str(dtype)] + # TODO: raise StrideError + for stride in strides: + assert stride % size == 0 + val_strides = tuple(int(s / size) for s in strides) + if vec == 1: + return val_strides + vecdim = range(len(strides))[vecdim] # -- make vecdim non-neg + for ii, val_stride in enumerate(val_strides): + if ii == vecdim: + assert val_stride == 1 + else: + assert val_stride % vec == 0 + vec_strides = [int(s / vec) for s in val_strides] + vec_strides[vecdim] = 1 + return tuple(vec_strides) + + + +def memoize(f): + cache = {} + def new_fn(*args, **kwargs): + key = args + tuple(sorted(kwargs.items())) + try: + return cache[key] + except KeyError: + rval = f(*args, **kwargs) + cache[key] = rval + return rval + new_fn.__name__ = f.__name__ + new_fn.memoize_cache = cache + return new_fn + + +@memoize +def gemm_cpu_prepare_reference(alpha, beta, M, N, K, dtype, + Astrides, Bstrides, Cstrides): + ctype = ctype_from_dtype(dtype) + (As0, As1) = elemstrides(Astrides, dtype) + (Bs0, Bs1) = elemstrides(Bstrides, dtype) + (Cs0, Cs1) = elemstrides(Cstrides, dtype) + prg = cl.Program(ctx, """ + __kernel void ref(__global %(ctype)s *A, + __global %(ctype)s *B, + __global %(ctype)s *C) + { + for(int mm = get_global_id(0); mm < %(M)s; mm += get_global_size(0)) + { + for(int nn = get_global_id(1); nn < %(N)s; nn += get_global_size(1)) + { + %(ctype)s buf = 0; + for (int kk = 0; kk < %(K)s; ++kk) + { + buf += A[mm * %(As0)s + kk * %(As1)s] + * B[kk * %(Bs0)s + nn * %(Bs1)s]; + } + C[mm * %(Cs0)s + nn * %(Cs1)s] *= %(beta)s; + C[mm * %(Cs0)s + nn * %(Cs1)s] += %(alpha)s * buf; + } + } + } + """ % locals()).build() + + return prg.ref + +class StrideError(Exception): + """StrideError""" + +class BlockingError(Exception): + """BlockingError""" + +vectorized_text = """ +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +__kernel void kern(__global ${ctype}${KB} *A, + __global ${ctype}${NB} *B, + __global ${ctype}${NB} *C) +{ + const ${ctype}${NB} beta = (${ctype}${NB})( + ${beta} + % for ii in range(NB - 1): + , ${beta} + % endfor + ); + ${ctype}${NB} tmp; + % for ii in range(MB): + ${ctype}${KB} Abuf${ii}; + ${ctype}${NB} Cbuf${ii}; + % endfor + ${ctype}${NB} Bbuf; + for(int mb = get_global_id(0); mb < ${NoMB}; mb += get_global_size(0)) + { + for(int nb = get_global_id(1); nb < ${NoNB}; nb += get_global_size(1)) + { + % for ii in range(MB): + Cbuf${ii} = (${ctype}${NB})( + 0 + % for foo in range(NB - 1): + , 0 + % endfor + ); + % endfor + + for (int kb = 0; kb < ${NoKB}; ++kb) + { + // load KB columns of A at a time + % for ii in range(MB): + Abuf${ii} = A[${As0} * (mb * ${MB} + ${ii}) + kb]; + % endfor + + % for kki in range(KB): + Bbuf = B[(kb * ${KB} + ${kki}) * ${Bs0} + nb]; + + % for ii in range(MB): + + tmp = (${ctype}${NB})( + Abuf${ii}.s${kki} + % for foo in range(NB - 1): + , Abuf${ii}.s${kki} + % endfor + ); + Cbuf${ii} = mad(tmp, Bbuf, Cbuf${ii}); + % endfor + + % endfor + } + + % if alpha != 1: + % for ii in range(MB): + Cbuf${ii} *= (${ctype}${NB})( + ${alpha} + % for foo in range(NB - 1): + , ${alpha} + % endfor + ); + % endfor + % endif + + % for ii in range(MB): + % if beta == 0: + C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = Cbuf${ii}; + % elif beta == 1: + C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] += Cbuf${ii}; + % else: + C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = mad(beta, C[(mb* ${MB} + ${ii}) * ${Cs0} + nb], Cbuf${ii}); + % endif + % endfor + } + } +} + """ + + +@memoize +def gemm_cpu_prepare_vectorized(alpha, beta, M, N, K, dtype, + Astrides, Bstrides, Cstrides, MB, NB, KB): + ctype = ctype_from_dtype(dtype) + (As0, As1) = elemstrides(Astrides, dtype, MB) + (Bs0, Bs1) = elemstrides(Bstrides, dtype, NB) + (Cs0, Cs1) = elemstrides(Cstrides, dtype, MB) + if MB: assert As1 == 1 + if NB: assert Bs1 == 1 + if MB: assert Cs1 == 1 + NoMB = M // MB + NoNB = N // NB + NoKB = K // KB + if M != MB * NoMB: + raise BlockingError() + if N != NB * NoNB: + raise BlockingError() + if K != KB * NoKB: + raise BlockingError() + text = Template(vectorized_text, output_encoding='ascii').render(**locals()) + for ii, line in enumerate(text.split('\n')): + print ii, line + prg = cl.Program(ctx, text).build() + print 'built!' + + return prg.kern + + +comptimes = [] +def gemm_pyopencl_cpu(alpha, A, B, beta, C): + kern = None + global_shape = (4, 4) # enough for different cores + local_shape = (1, 1) # I think this does nothing on CPU (true?) + + # TODO: some values of these constants that should work, do not. + # e.g. 16, 8, 8 -> wrong answer + # 4, 4, 8 -> segfault + for MB in [8]: + for NB in [8]: + for KB in [8]: + if kern: + continue + try: + kern = gemm_cpu_prepare_vectorized( + alpha, beta, + C.shape[0], C.shape[1], A.shape[1], + A.dtype, + A.strides, B.strides, C.strides, + MB=MB, NB=NB, KB=KB) + print 'Using kernel for MB=%i NB=%i KB=%i' % (MB, NB, KB) + except StrideError: + pass + if kern is None: + kern = gemm_cpu_prepare_reference(alpha, beta, + C.shape[0], C.shape[1], A.shape[1], + A.dtype, + A.strides, B.strides, C.strides) + + A_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=A) + B_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=B) + C_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=C) + ev = kern(queue, global_shape, local_shape, A_buf, B_buf, C_buf) + cl.enqueue_copy(queue, C, C_buf) + queue.finish() + if PROFILING: + comptimes.append(1e-9 * (ev.profile.end - ev.profile.start)) + print 'computation time', min(comptimes) + + +benchmarks = ( + gemm_pyopencl_cpu, +) + +NNN = 1024 #* 2 +def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float64): + rng = np.random.RandomState(seed) + A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype) + B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype) + C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) + C2 = np.empty_like(C) + Z = np.ones(10 * 1024 * 1024) + alpha = 1.0 + beta = 0.0 + FLOPS = shape[0] * shape[1] * (shape[2] + 1) * 2 + for i in range(500): + rng.seed(i) + C[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) + + t0 = time.time() + gemm_pyopencl_cpu(alpha, A, B, beta, C) + t1 = time.time() + print 'pyopencl time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3) + + rng.seed(i) + C2[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) + + t0 = time.time() + C3 = alpha * np.dot( A, B) + beta * C2 + t1 = time.time() + print 'np.dot time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3) + + print np.max(abs(C - C3)) + assert np.allclose(C, C3) + # -- clear processor cache + Z.sum() + time.sleep(1) + +if __name__ == '__main__': + sys.exit(main()) From 16805c6b7a077606ce9033ee757adb303afbaccb Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 14 Aug 2013 09:49:37 -0400 Subject: [PATCH 04/12] gemm_pyopencl -> BSD-3 --- gemm/gemm_pyopencl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py index 53bf0ff..d0ce9fd 100644 --- a/gemm/gemm_pyopencl.py +++ b/gemm/gemm_pyopencl.py @@ -1,5 +1,5 @@ # Authors: James Bergstra -# License: MIT +# License: BSD-3 import sys import time @@ -10,7 +10,7 @@ mf = cl.mem_flags -PROFILING = 1 +PROFILING = 0 ctx = cl.create_some_context() if PROFILING: @@ -257,8 +257,8 @@ def gemm_pyopencl_cpu(alpha, A, B, beta, C): gemm_pyopencl_cpu, ) -NNN = 1024 #* 2 -def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float64): +NNN = 1024 * 2 +def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float32): rng = np.random.RandomState(seed) A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype) B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype) @@ -276,6 +276,7 @@ def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float64): gemm_pyopencl_cpu(alpha, A, B, beta, C) t1 = time.time() print 'pyopencl time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3) + continue rng.seed(i) C2[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) From f01dbb16f02d637dad45a0296583fe98366cf5ec Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 14 Aug 2013 11:46:08 -0400 Subject: [PATCH 05/12] several gemm implementations --- gemm/__init__.py | 13 +- gemm/gemm_cython.pyx | 32 +++++ gemm/gemm_numba.py | 11 ++ gemm/gemm_parakeet.py | 13 ++ gemm/gemm_pyopencl.py | 295 +----------------------------------------- gemm/gemm_python.py | 3 +- gemm/gemm_theano.py | 45 +++++++ 7 files changed, 113 insertions(+), 299 deletions(-) create mode 100644 gemm/gemm_cython.pyx create mode 100644 gemm/gemm_numba.py create mode 100644 gemm/gemm_parakeet.py create mode 100644 gemm/gemm_theano.py diff --git a/gemm/__init__.py b/gemm/__init__.py index 000ffa7..7ee8d1a 100644 --- a/gemm/__init__.py +++ b/gemm/__init__.py @@ -4,11 +4,12 @@ import numpy as np -def make_env(shape=(256, 256, 256), seed=0, dtype=np.float): +def make_env(M=512, N=512, K=512, seed=0, dtype=np.float64, + alpha=1.5, + beta=0.5): rng = np.random.RandomState(seed) - A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype) - B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype) - C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) - alpha = 1.5 - beta = 0.3 + A = np.asarray(rng.normal(size=(M, K)), dtype=dtype) + B = np.asarray(rng.normal(size=(K, N)), dtype=dtype) + C = np.asarray(rng.normal(size=(M, N)), dtype=dtype) return (alpha, A, B, beta, C), {} + diff --git a/gemm/gemm_cython.pyx b/gemm/gemm_cython.pyx new file mode 100644 index 0000000..2f597be --- /dev/null +++ b/gemm/gemm_cython.pyx @@ -0,0 +1,32 @@ + +# Authors: Jake Vanderplas, Olivier Grisel +# License: MIT + +import numpy as np +cimport cython +from libc.math cimport sqrt + +@cython.boundscheck(False) +@cython.wraparound(False) +def gemm_cython_for_loops( + double alpha, + double[:, ::1] A, + double[:, ::1] B, + double beta, + double[:, ::1] C, + ): + cdef int M = C.shape[0] + cdef int N = C.shape[1] + cdef int K = A.shape[1] + cdef double tmp, d + for i in range(M): + for j in range(N): + d = 0.0 + for k in range(K): + d += A[i, k] * B[k, j] + C[i, j] = alpha * d + beta * C[i, j] + + +benchmarks = ( + gemm_cython_for_loops, +) diff --git a/gemm/gemm_numba.py b/gemm/gemm_numba.py new file mode 100644 index 0000000..702ca38 --- /dev/null +++ b/gemm/gemm_numba.py @@ -0,0 +1,11 @@ +# Authors: Olivier Grisel +# License: MIT + +from gemm import gemm_python +from numba import autojit + + +benchmarks = ( + ("gemm_numba_nested_for_loops", + autojit(gemm_python.gemm_python_nested_for_loops)), +) diff --git a/gemm/gemm_parakeet.py b/gemm/gemm_parakeet.py new file mode 100644 index 0000000..8c189d1 --- /dev/null +++ b/gemm/gemm_parakeet.py @@ -0,0 +1,13 @@ +# Authors: Olivier Grisel +# License: MIT + +from gemm import gemm_python +from parakeet import jit + + +benchmarks = ( + ("gemm_parakeet_nested_for_loops", + jit(gemm_python.gemm_python_nested_for_loops)), + ("gemm_parakeet_inner_numpy", + jit(gemm_python.gemm_python_inner_numpy)), +) diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py index d0ce9fd..a1a9eaf 100644 --- a/gemm/gemm_pyopencl.py +++ b/gemm/gemm_pyopencl.py @@ -1,296 +1,9 @@ -# Authors: James Bergstra -# License: BSD-3 - -import sys -import time - -from mako.template import Template -import numpy as np -import pyopencl as cl - -mf = cl.mem_flags - -PROFILING = 0 - -ctx = cl.create_some_context() -if PROFILING: - queue = cl.CommandQueue( - ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) -else: - queue = cl.CommandQueue(ctx) - - -def ctype_from_dtype(dtype): - return { - 'float32': 'float', - 'float64': 'double', - }[str(dtype)] - - -def elemstrides(strides, dtype, vec=1, vecdim=-1): - size = { - 'float32': 4, - 'float64': 8, - }[str(dtype)] - # TODO: raise StrideError - for stride in strides: - assert stride % size == 0 - val_strides = tuple(int(s / size) for s in strides) - if vec == 1: - return val_strides - vecdim = range(len(strides))[vecdim] # -- make vecdim non-neg - for ii, val_stride in enumerate(val_strides): - if ii == vecdim: - assert val_stride == 1 - else: - assert val_stride % vec == 0 - vec_strides = [int(s / vec) for s in val_strides] - vec_strides[vecdim] = 1 - return tuple(vec_strides) - - - -def memoize(f): - cache = {} - def new_fn(*args, **kwargs): - key = args + tuple(sorted(kwargs.items())) - try: - return cache[key] - except KeyError: - rval = f(*args, **kwargs) - cache[key] = rval - return rval - new_fn.__name__ = f.__name__ - new_fn.memoize_cache = cache - return new_fn - - -@memoize -def gemm_cpu_prepare_reference(alpha, beta, M, N, K, dtype, - Astrides, Bstrides, Cstrides): - ctype = ctype_from_dtype(dtype) - (As0, As1) = elemstrides(Astrides, dtype) - (Bs0, Bs1) = elemstrides(Bstrides, dtype) - (Cs0, Cs1) = elemstrides(Cstrides, dtype) - prg = cl.Program(ctx, """ - __kernel void ref(__global %(ctype)s *A, - __global %(ctype)s *B, - __global %(ctype)s *C) - { - for(int mm = get_global_id(0); mm < %(M)s; mm += get_global_size(0)) - { - for(int nn = get_global_id(1); nn < %(N)s; nn += get_global_size(1)) - { - %(ctype)s buf = 0; - for (int kk = 0; kk < %(K)s; ++kk) - { - buf += A[mm * %(As0)s + kk * %(As1)s] - * B[kk * %(Bs0)s + nn * %(Bs1)s]; - } - C[mm * %(Cs0)s + nn * %(Cs1)s] *= %(beta)s; - C[mm * %(Cs0)s + nn * %(Cs1)s] += %(alpha)s * buf; - } - } - } - """ % locals()).build() - - return prg.ref - -class StrideError(Exception): - """StrideError""" - -class BlockingError(Exception): - """BlockingError""" - -vectorized_text = """ -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -__kernel void kern(__global ${ctype}${KB} *A, - __global ${ctype}${NB} *B, - __global ${ctype}${NB} *C) -{ - const ${ctype}${NB} beta = (${ctype}${NB})( - ${beta} - % for ii in range(NB - 1): - , ${beta} - % endfor - ); - ${ctype}${NB} tmp; - % for ii in range(MB): - ${ctype}${KB} Abuf${ii}; - ${ctype}${NB} Cbuf${ii}; - % endfor - ${ctype}${NB} Bbuf; - for(int mb = get_global_id(0); mb < ${NoMB}; mb += get_global_size(0)) - { - for(int nb = get_global_id(1); nb < ${NoNB}; nb += get_global_size(1)) - { - % for ii in range(MB): - Cbuf${ii} = (${ctype}${NB})( - 0 - % for foo in range(NB - 1): - , 0 - % endfor - ); - % endfor - - for (int kb = 0; kb < ${NoKB}; ++kb) - { - // load KB columns of A at a time - % for ii in range(MB): - Abuf${ii} = A[${As0} * (mb * ${MB} + ${ii}) + kb]; - % endfor - - % for kki in range(KB): - Bbuf = B[(kb * ${KB} + ${kki}) * ${Bs0} + nb]; - - % for ii in range(MB): - - tmp = (${ctype}${NB})( - Abuf${ii}.s${kki} - % for foo in range(NB - 1): - , Abuf${ii}.s${kki} - % endfor - ); - Cbuf${ii} = mad(tmp, Bbuf, Cbuf${ii}); - % endfor - - % endfor - } - - % if alpha != 1: - % for ii in range(MB): - Cbuf${ii} *= (${ctype}${NB})( - ${alpha} - % for foo in range(NB - 1): - , ${alpha} - % endfor - ); - % endfor - % endif - - % for ii in range(MB): - % if beta == 0: - C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = Cbuf${ii}; - % elif beta == 1: - C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] += Cbuf${ii}; - % else: - C[(mb * ${MB} + ${ii}) * ${Cs0} + nb] = mad(beta, C[(mb* ${MB} + ${ii}) * ${Cs0} + nb], Cbuf${ii}); - % endif - % endfor - } - } -} - """ - - -@memoize -def gemm_cpu_prepare_vectorized(alpha, beta, M, N, K, dtype, - Astrides, Bstrides, Cstrides, MB, NB, KB): - ctype = ctype_from_dtype(dtype) - (As0, As1) = elemstrides(Astrides, dtype, MB) - (Bs0, Bs1) = elemstrides(Bstrides, dtype, NB) - (Cs0, Cs1) = elemstrides(Cstrides, dtype, MB) - if MB: assert As1 == 1 - if NB: assert Bs1 == 1 - if MB: assert Cs1 == 1 - NoMB = M // MB - NoNB = N // NB - NoKB = K // KB - if M != MB * NoMB: - raise BlockingError() - if N != NB * NoNB: - raise BlockingError() - if K != KB * NoKB: - raise BlockingError() - text = Template(vectorized_text, output_encoding='ascii').render(**locals()) - for ii, line in enumerate(text.split('\n')): - print ii, line - prg = cl.Program(ctx, text).build() - print 'built!' - - return prg.kern - - -comptimes = [] -def gemm_pyopencl_cpu(alpha, A, B, beta, C): - kern = None - global_shape = (4, 4) # enough for different cores - local_shape = (1, 1) # I think this does nothing on CPU (true?) - - # TODO: some values of these constants that should work, do not. - # e.g. 16, 8, 8 -> wrong answer - # 4, 4, 8 -> segfault - for MB in [8]: - for NB in [8]: - for KB in [8]: - if kern: - continue - try: - kern = gemm_cpu_prepare_vectorized( - alpha, beta, - C.shape[0], C.shape[1], A.shape[1], - A.dtype, - A.strides, B.strides, C.strides, - MB=MB, NB=NB, KB=KB) - print 'Using kernel for MB=%i NB=%i KB=%i' % (MB, NB, KB) - except StrideError: - pass - if kern is None: - kern = gemm_cpu_prepare_reference(alpha, beta, - C.shape[0], C.shape[1], A.shape[1], - A.dtype, - A.strides, B.strides, C.strides) - - A_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=A) - B_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=B) - C_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=C) - ev = kern(queue, global_shape, local_shape, A_buf, B_buf, C_buf) - cl.enqueue_copy(queue, C, C_buf) - queue.finish() - if PROFILING: - comptimes.append(1e-9 * (ev.profile.end - ev.profile.start)) - print 'computation time', min(comptimes) +# Author: James Bergstra +# License: MIT +from pybench_pyopencl import gemm_pyopencl benchmarks = ( - gemm_pyopencl_cpu, + gemm_pyopencl.gemm_pyopencl_cpu, ) -NNN = 1024 * 2 -def main(shape=(NNN, NNN, NNN), seed=0, dtype=np.float32): - rng = np.random.RandomState(seed) - A = np.asarray(rng.normal(size=(shape[0], shape[1])), dtype=dtype) - B = np.asarray(rng.normal(size=(shape[1], shape[2])), dtype=dtype) - C = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) - C2 = np.empty_like(C) - Z = np.ones(10 * 1024 * 1024) - alpha = 1.0 - beta = 0.0 - FLOPS = shape[0] * shape[1] * (shape[2] + 1) * 2 - for i in range(500): - rng.seed(i) - C[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) - - t0 = time.time() - gemm_pyopencl_cpu(alpha, A, B, beta, C) - t1 = time.time() - print 'pyopencl time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3) - continue - - rng.seed(i) - C2[:] = np.asarray(rng.normal(size=(shape[0], shape[2])), dtype=dtype) - - t0 = time.time() - C3 = alpha * np.dot( A, B) + beta * C2 - t1 = time.time() - print 'np.dot time: ', (t1 - t0), 'GFlops', FLOPS / (t1 - t0) / (1000 ** 3) - - print np.max(abs(C - C3)) - assert np.allclose(C, C3) - # -- clear processor cache - Z.sum() - time.sleep(1) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/gemm/gemm_python.py b/gemm/gemm_python.py index 021e4ee..9a7ce52 100644 --- a/gemm/gemm_python.py +++ b/gemm/gemm_python.py @@ -4,8 +4,8 @@ import numpy as np +# -- too slow to run directly, but good for JIT by other systems def gemm_python_nested_for_loops(alpha, A, B, beta, C): - M, N = C.shape K = A.shape[1] #"omp parallel for private(j, d, k, tmp)" @@ -39,7 +39,6 @@ def gemm_python_numpy_dot(alpha, A, B, beta, C): benchmarks = ( - gemm_python_nested_for_loops, gemm_python_inner_numpy, gemm_python_broadcast_numpy, gemm_python_numpy_dot, diff --git a/gemm/gemm_theano.py b/gemm/gemm_theano.py new file mode 100644 index 0000000..e6948bd --- /dev/null +++ b/gemm/gemm_theano.py @@ -0,0 +1,45 @@ +# Authors: James Bergstra +# License: MIT +import theano +import theano.tensor as TT + + +def gemm_theano_tensor_prepare(dtype): + alpha = TT.scalar(dtype=str(dtype)) + beta = TT.scalar(dtype=str(dtype)) + A = TT.matrix(dtype=str(dtype)) + B = TT.matrix(dtype=str(dtype)) + C = TT.matrix(dtype=str(dtype)) + Z = alpha * TT.sum(A[:, None, :] * B, axis=2) + beta * C + name = 'gemm_theano_broadcast_' + dtype + Cin = theano.In(C, mutable=True, borrow=True) + rval = theano.function([alpha, A, B, beta, Cin], + theano.Out(Z, borrow=True), + allow_input_downcast=True, name=name) + rval.__name__ = name + return rval + + +def gemm_theano_blas_prepare(dtype): + alpha = TT.scalar(dtype=str(dtype)) + beta = TT.scalar(dtype=str(dtype)) + A = TT.matrix(dtype=str(dtype)) + B = TT.matrix(dtype=str(dtype)) + C = TT.matrix(dtype=str(dtype)) + Z = alpha * TT.dot(A, B) + beta * C + Cin = theano.In(C, mutable=True, borrow=True) + name = 'gemm_theano_blas_' + dtype + rval = theano.function([alpha, A, B, beta, Cin], + theano.Out(Z, borrow=True), + allow_input_downcast=True, name=name) + rval.__name__ = name + return rval + + +benchmarks = ( + #gemm_theano_tensor_prepare('float32'), + gemm_theano_tensor_prepare('float64'), + #gemm_theano_blas_prepare('float32'), + gemm_theano_blas_prepare('float64'), +) + From de77f07c061ce4947ea51d41b671a54a363619c4 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 14 Aug 2013 11:51:18 -0400 Subject: [PATCH 06/12] implementing pairwise_pyopencl like gemm_pyopencl --- pairwise/pairwise_pyopencl.py | 104 +++------------------------------- 1 file changed, 7 insertions(+), 97 deletions(-) diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py index 6e4e40d..c6a6669 100644 --- a/pairwise/pairwise_pyopencl.py +++ b/pairwise/pairwise_pyopencl.py @@ -1,105 +1,15 @@ -# Authors: James Bergstra +# Author: James Bergstra # License: MIT import numpy as np -import time -import pyopencl as cl -import numpy +from pybench_pyopencl import pairwise_pyopencl -mf = cl.mem_flags - -PROFILING = 0 - -ctx = cl.create_some_context() -if PROFILING: - queue = cl.CommandQueue( - ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) -else: - queue = cl.CommandQueue(ctx) - -_cache = {} - -def pairwise_pyopencl_cpu_prepare(shp, dtype): - N, D = shp - ctype = { - 'float32': 'float', - 'float64': 'double', - }[str(dtype)] - - odd_d = "" if 0 == D % 2 else """ - __global %(ctype)s * a1 = (__global %(ctype)s*) (a); - %(ctype)s diff = a1[(n0 + 1) * %(D)s - 1] - a1[(m0 + 1) * %(D)s - 1]; - buf.s0 += diff * diff; - """ - - prg = cl.Program(ctx, """ - __kernel void lower(__global %(ctype)s2 *a, __global %(ctype)s *c) - { - for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0)) - { - for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1)) - { - if (n0 < m0) continue; - __global %(ctype)s2 *an = a + n0 * %(D)s / 2; - __global %(ctype)s2 *am = a + m0 * %(D)s / 2; - %(ctype)s2 buf = 0; - for (int d = 0; d < %(D)s/2; ++d) - { - %(ctype)s2 diff = am[d] - an[d]; - buf += diff * diff; - } - %(odd_d)s; - c[m0 * %(N)s + n0] = sqrt(buf.s0 + buf.s1); - } - } - } - __kernel void upper(__global %(ctype)s *a, __global %(ctype)s *c) - { - for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0)) - { - for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1)) - { - if (n0 >= m0) continue; - c[m0 * %(N)s + n0] = c[n0 * %(N)s + m0]; - } - } - } - """ % locals()).build() - - return prg.lower, prg.upper - - -comptimes = [] def pairwise_pyopencl_cpu(data): - data = np.asarray(data, order='C') - N, D = data.shape - try: - lower, upper = _cache[(data.shape, data.dtype)] - except: - lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype) - _cache[(data.shape, data.dtype)] = lower, upper - data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data) - dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize) - try: - rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ, - offset=0, shape=(N, N), dtype=data.dtype) - need_copy = False - except TypeError: #OSX's OCL needs this? - rval = np.empty((N, N), dtype=data.dtype) - need_copy = True - lower(queue, (N, 1), (1, 1), data_buf, dest_buf) - upper(queue, (4, 4), (1, 1), data_buf, dest_buf) - if need_copy: - cl.enqueue_copy(queue, rval, dest_buf) - else: - queue.finish() - if PROFILING: - comptimes.append(1e-9 * (ev.profile.end - ev.profile.start)) - print 'computation time', min(comptimes) - return rval - + M, K = data.shape + out = np.zeros((M, M), dtype=data.dtype) + return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out) benchmarks = ( - pairwise_pyopencl_cpu, + pairwise_pyopencl_cpu, ) + From 1105d1515327b3a84a18e7c1e6468d3902c18b00 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 14 Aug 2013 13:41:02 -0400 Subject: [PATCH 07/12] comment with location of pybench_pyopencl --- gemm/gemm_pyopencl.py | 1 + pairwise/pairwise_pyopencl.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py index a1a9eaf..235dbe9 100644 --- a/gemm/gemm_pyopencl.py +++ b/gemm/gemm_pyopencl.py @@ -1,6 +1,7 @@ # Author: James Bergstra # License: MIT +# -- https://github.com/jaberg/python-benchmarks-pyopencl from pybench_pyopencl import gemm_pyopencl benchmarks = ( diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py index c6a6669..6cbce99 100644 --- a/pairwise/pairwise_pyopencl.py +++ b/pairwise/pairwise_pyopencl.py @@ -2,6 +2,8 @@ # License: MIT import numpy as np + +# -- https://github.com/jaberg/python-benchmarks-pyopencl from pybench_pyopencl import pairwise_pyopencl def pairwise_pyopencl_cpu(data): From b89f4841afc0ea40d6ab01a238e24ed085654ade Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 14 Aug 2013 13:44:15 -0400 Subject: [PATCH 08/12] pairwise: removed "cheating" gemm-based impls --- pairwise/pairwise_python.py | 7 ------- pairwise/pairwise_theano.py | 16 +--------------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/pairwise/pairwise_python.py b/pairwise/pairwise_python.py index 21698fe..c792783 100644 --- a/pairwise/pairwise_python.py +++ b/pairwise/pairwise_python.py @@ -31,15 +31,8 @@ def pairwise_python_broadcast_numpy(data): return np.sqrt(((data[:, None, :] - data) ** 2).sum(axis=2)) -def pairwise_python_numpy_dot(data): - X_norm_2 = (data ** 2).sum(axis=1) - dists = np.sqrt(2 * X_norm_2 - np.dot(data, data.T)) - return dists - - benchmarks = ( pairwise_python_nested_for_loops, pairwise_python_inner_numpy, pairwise_python_broadcast_numpy, - pairwise_python_numpy_dot, ) diff --git a/pairwise/pairwise_theano.py b/pairwise/pairwise_theano.py index 367f545..a13661f 100644 --- a/pairwise/pairwise_theano.py +++ b/pairwise/pairwise_theano.py @@ -18,21 +18,7 @@ def pairwise_theano_tensor_prepare(dtype): return rval -def pairwise_theano_blas_prepare(dtype): - X = TT.matrix(dtype=str(dtype)) - X_norm_2 = (X ** 2).sum(axis=1) - dists = TT.sqrt(2 * X_norm_2 - TT.dot(X, X.T)) - name = 'pairwise_theano_blas_' + dtype - rval = theano.function([X], - theano.Out(dists, borrow=True), - allow_input_downcast=True, name=name) - rval.__name__ = name - return rval - - benchmarks = ( - pairwise_theano_tensor_prepare('float32'), + #pairwise_theano_tensor_prepare('float32'), pairwise_theano_tensor_prepare('float64'), - pairwise_theano_blas_prepare('float32'), - pairwise_theano_blas_prepare('float64'), ) From 8f3b24d08d080e4c5931f60da22a008e81bd8833 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Wed, 14 Aug 2013 19:05:23 -0400 Subject: [PATCH 09/12] pairwise_pyopencl init output empty instead of 0 --- pairwise/pairwise_pyopencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py index 6cbce99..983195c 100644 --- a/pairwise/pairwise_pyopencl.py +++ b/pairwise/pairwise_pyopencl.py @@ -8,7 +8,7 @@ def pairwise_pyopencl_cpu(data): M, K = data.shape - out = np.zeros((M, M), dtype=data.dtype) + out = np.empty((M, M), dtype=data.dtype, order='C') return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out) benchmarks = ( From 2f23d945ce6060a2169a0fb18b95c6202a2ed889 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Thu, 15 Aug 2013 10:11:32 -0400 Subject: [PATCH 10/12] Wrote gemm benchmark docstring --- gemm/__init__.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/gemm/__init__.py b/gemm/__init__.py index 7ee8d1a..de0b9a7 100644 --- a/gemm/__init__.py +++ b/gemm/__init__.py @@ -1,6 +1,35 @@ # Authors: James Bergstra # License: MIT +"""Computes the GEMM routine from the BLAS standard. + +BLAS defines the xGEMM (DGEMM, SGEMM, CGEMM, ZGEMM) operations as +dense matrix multiplication and accumulation as follows: + +C <- alpha A x B + beta C + +Here (alpha, beta) are scalars and (A, B, C) are matrices. + +This operation is one of the most widely-used and most-studied kernels in +high-performance computing, and BLAS implementations such as OpenBLAS, ATLAS, +and MKL provide highly optimized implementations of this operation. GEMM +implementations provide a real-life measure of peak performance on a +particular platform. + +Note that the GEMM interface does not actually describe an algorithm, and the +standard does not require particular numerical accuracy. There are sub-cubic +algorithms (e.g. Strassen), and there are also cubic algorithms that are +"blocked" to be more cache-friendly. I believe that OpenBLAS and ATLAS use +blocked cubic algorithms, based on the asymptotic GFLOP/s attributed to MKL, +I would guess it uses blocked cubic algorithms too. + +My hope with this benchmark is that it can be used to develop fast, readable +GEMM implementations. I'm curious, for example, if a readable, blocked +algorithm in pure Python could be compiled to a reasonable-performing +implementation. + +""" + import numpy as np From 3b7370db5d36e615cd55f04b3c8be4e2d02606c6 Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Thu, 15 Aug 2013 10:11:44 -0400 Subject: [PATCH 11/12] Wrote pairwise benchmark docstring --- pairwise/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pairwise/__init__.py b/pairwise/__init__.py index ce70b6f..547724b 100644 --- a/pairwise/__init__.py +++ b/pairwise/__init__.py @@ -1,6 +1,16 @@ # Authors: Olivier Grisel # License: MIT +"""Computes the Euclidean distance between each pair of rows in a matrix. + +In LaTeX: + Y[i, j] = sqrt{ \sum_k (A[i, k] - B[j, k])^2 } + +This computation is a core routine of many machine learning algorithms that +rely on neighbourhood computations. + +""" + import numpy as np From 79d41430999764625e849f891976e55134e0c61f Mon Sep 17 00:00:00 2001 From: James Bergstra Date: Thu, 15 Aug 2013 10:14:15 -0400 Subject: [PATCH 12/12] Commented on disabling of pairwise_theano float32 --- pairwise/pairwise_theano.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pairwise/pairwise_theano.py b/pairwise/pairwise_theano.py index a13661f..431ef0d 100644 --- a/pairwise/pairwise_theano.py +++ b/pairwise/pairwise_theano.py @@ -19,6 +19,9 @@ def pairwise_theano_tensor_prepare(dtype): benchmarks = ( - #pairwise_theano_tensor_prepare('float32'), + # -- disabling float32 to match the precision of the other + # implementations (assuming that the benchmark problem is + # to carry out computations in double precision). + # pairwise_theano_tensor_prepare('float32'), pairwise_theano_tensor_prepare('float64'), )