Skip to content

Commit

Permalink
Merge pull request #71 from GlowingScrewdriver/clisp-to-ptx
Browse files Browse the repository at this point in the history
Running C-Lisp code on Nvidia GPUs
  • Loading branch information
chsasank authored Jul 18, 2024
2 parents 7e70722 + 031313e commit 473274a
Show file tree
Hide file tree
Showing 11 changed files with 367 additions and 3 deletions.
7 changes: 6 additions & 1 deletion src/backend/brilisp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ def gen_function(expr):

def gen_type(typ):
    """Translate a Brilisp type s-expression into its JSON (Bril) form.

    Scalar types (plain symbols) pass through unchanged. Compound types
    must be pointers: ``(ptr <elem>)`` or ``(ptr <elem> (addrspace <n>))``,
    which become ``{"ptr": <elem-json>}`` with an optional ``"addrspace"``
    key carrying the address-space number.
    """
    if is_list(typ):
        # Only pointer compound types are supported; fail loudly otherwise.
        assert typ[0] == "ptr", f"unsupported compound type: {typ}"
        retval = {"ptr": gen_type(typ[1])}
        # Optional address-space qualifier, e.g. (ptr float (addrspace 1))
        if len(typ) > 2 and is_list(typ[2]):
            assert typ[2][0] == "addrspace", f"unexpected pointer qualifier: {typ[2]}"
            retval["addrspace"] = typ[2][1]
        return retval
    else:
        # Scalar/base type: returned verbatim
        return typ

Expand Down
6 changes: 4 additions & 2 deletions src/backend/llvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def generate(self, bril_prog):
def gen_type(self, type):
if isinstance(type, dict):
if "ptr" in type:
return self.gen_type(type["ptr"]).as_pointer()
addrspace = type.get("addrspace", 0)
type_obj = self.gen_type(type["ptr"]).as_pointer(addrspace)
return type_obj
else:
raise CodegenError(f"Unknown type {type}")
elif type in ["int", "int32"]:
Expand Down Expand Up @@ -126,7 +128,7 @@ def gen_instructions(self, instrs):
"sitofp": "sitofp",
"ptrtoint": "ptrtoint",
"inttoptr": "inttoptr",
"bitcast": "bitcast"
"bitcast": "bitcast",
}

def gen_label(instr):
Expand Down
19 changes: 19 additions & 0 deletions src/backend/tests/brilisp/addrspace.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
; ModuleID = ""
target triple = "unknown-unknown-unknown"
target datalayout = ""

define void @"add5"(float addrspace(1)* %"a")
{
alloca:
%"a.1" = alloca float addrspace(1)*
%"five" = alloca float
br label %"entry"
entry:
store float addrspace(1)* %"a", float addrspace(1)** %"a.1"
store float 0x4014000000000000, float* %"five"
%".5" = load float addrspace(1)*, float addrspace(1)** %"a.1"
%".6" = load float, float* %"five"
store float %".6", float addrspace(1)* %".5"
ret void
}

7 changes: 7 additions & 0 deletions src/backend/tests/brilisp/addrspace.sexp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
;; CMD: guile ../../utils/sexp-json.scm < addrspace.sexp | python ../../brilisp.py | python ../../llvm.py | sed -e 's/alloca-[a-z]*/alloca/' -e 's/entry-[a-z]*/entry/'

(brilisp
; add5 takes a pointer into address space 1 and stores the constant 5.0
; through it — exercises the (addrspace N) pointer-type qualifier
; end-to-end (see the expected IR in addrspace.out).
(define ((add5 void) (a (ptr float (addrspace 1))))
(set (five float) (const 5.0))
(store a five)
(ret)))
15 changes: 15 additions & 0 deletions src/backend/tests/kernelprog/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## C-Lisp PTX Generation Proof-of-Concept

Each subdirectory here contains a device-side kernel, stored as `<name>.sexp`, and a
host-side driver, stored as `<name>.driver.c`. The driver JIT-compiles and launches
the kernel, then compares its results against a host-side reference implementation.

To test a kernel, run `make <kernel name>.run` from the kernel's directory. This
will
* Compile the kernel from C-Lisp to PTX
* Compile the driver from C to an executable
* Run the driver, which in turn will
- Initialize random inputs
- JIT the kernel and launch it
- Run the reference implementation and compare results
1 change: 1 addition & 0 deletions src/backend/tests/kernelprog/matmul/Makefile
115 changes: 115 additions & 0 deletions src/backend/tests/kernelprog/matmul/matmul.driver.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>

/* Host-side code to launch the kernel function and
validate the results against those of a reference function */

// Reference implementation, defined per kernel:
// naive triple-loop matrix multiply, res = a * b (all N x N, row-major)
void ref_kernel(float * a, float * b, float * res, int N) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            // Accumulate the dot product of row `row` of a and column `col` of b
            // (same addition order as writing through res directly).
            float acc = 0;
            for (int k = 0; k < N; k++)
                acc += a[row * N + k] * b[k * N + col];
            res[row * N + col] = acc;
        }
    }
}

// CUDA error checking: ERR_CHECK wraps a CUDA driver API call, stringizes it,
// and aborts the process with a diagnostic on any non-zero status.
#define ERR_CHECK(X) \
    error_check(X, #X)
// call_str: textual form of the failing call (from the # stringize operator).
void error_check(int res, const char * call_str) {
    if (res) {
        // Report on stderr so the message is not lost when stdout is piped
        // (the Makefile pipes this program's stdout).
        fprintf(stderr, "%s returned non-zero status %d\n", call_str, res);
        exit(1);
    }
}

// Read a PTX image containing a kernel from stdin into buf, NUL-terminated.
// NOTE(review): no bounds check — the caller must supply a buffer large
// enough for the whole PTX image plus the terminator.
void read_module(char * buf) {
    printf("Reading kernel from standard input...\n");

    // Must be int, not char: getchar() returns EOF (-1) out-of-band, and a
    // plain char (unsigned on some platforms) can never compare equal to EOF,
    // which would loop forever; on signed-char platforms a 0xFF byte would
    // falsely terminate the read.
    int c;
    while ((c = getchar()) != EOF) {
        *(buf++) = (char)c;
    }
    *buf = '\0';
}

// Host-side driver: reads a PTX image for "kernel" from stdin, launches it
// over a 2-D grid covering an N x N matmul, and validates the device result
// against ref_kernel computed on the host.
int main (int argc, char ** argv) {
int devCount;
CUdevice device;
CUcontext context;
CUmodule module;
// NOTE(review): fixed 5000-byte buffer and read_module does no bounds
// checking — a PTX image of 5000 bytes or more overflows this; confirm size.
char kernel_ptx[5000];
CUfunction kernel_func;

// CUDA initialization and context creation
ERR_CHECK(cuInit(0));
ERR_CHECK(cuDeviceGetCount(&devCount));
ERR_CHECK(cuDeviceGet(&device, 0));
ERR_CHECK(cuCtxCreate(&context, 0, device));

// Load the kernel image and get a handle to the kernel function
read_module(kernel_ptx);
ERR_CHECK(cuModuleLoadData(&module, kernel_ptx));
ERR_CHECK(cuModuleGetFunction(&kernel_func, module, "kernel"));

// Allocate input and result
int N = 256;
int len = N * N;
float
*a = malloc(sizeof(float) * len),
*b = malloc(sizeof(float) * len),
*res_device = malloc(sizeof(float) * len),
*res_host = malloc(sizeof(float) * len);

// NOTE(review): rand() values exceed float's 24-bit mantissa, so host and
// device rounding may differ slightly — the max_err report is approximate.
for (int i = 0; i < len; i++) {
a[i] = rand();
b[i] = rand();
}

// Run the reference implementation
ref_kernel(a, b, res_host, N);

// Copy data to the device
CUdeviceptr dev_a, dev_b, dev_res;
ERR_CHECK(cuMemAlloc(&dev_a, sizeof(float) * len));
ERR_CHECK(cuMemAlloc(&dev_b, sizeof(float) * len));
ERR_CHECK(cuMemAlloc(&dev_res, sizeof(float) * len));
ERR_CHECK(cuMemcpyHtoD(dev_a, a, sizeof(float) * len));
ERR_CHECK(cuMemcpyHtoD(dev_b, b, sizeof(float) * len));

// Launch the kernel and wait.
// The driver API takes kernel arguments as an array of pointers to each value.
void * KernelParams [] = { &dev_a, &dev_b, &dev_res, &N };
int BlockSize = 32;
// Ceil-division so the 2-D grid covers all N rows/columns even if N % 32 != 0
int GridSize = (N + BlockSize - 1) / BlockSize;
ERR_CHECK(cuLaunchKernel(kernel_func,
// Grid sizes X, Y, Z
GridSize, GridSize, 1,
// Block sizes X, Y, Z
BlockSize, BlockSize, 1,
// shared mem size, stream id, kernel params, extra options
0, NULL, KernelParams, NULL));
ERR_CHECK(cuCtxSynchronize());

// Retrieve and verify results
ERR_CHECK(cuMemcpyDtoH(res_device, dev_res, sizeof(float) * len));
float max_err = 0.0;
for (int i = 0; i < len; i++)
max_err = fmax(max_err,
fabs(res_host[i] - res_device[i]));
printf("Max error: %f\n", max_err);

// Cleanup
free(a);
free(b);
free(res_device);
free(res_host);
ERR_CHECK(cuMemFree(dev_a));
ERR_CHECK(cuMemFree(dev_b));
ERR_CHECK(cuMemFree(dev_res));
ERR_CHECK(cuModuleUnload(module));
ERR_CHECK(cuCtxDestroy(context));
}
47 changes: 47 additions & 0 deletions src/backend/tests/kernelprog/matmul/matmul.sexp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
(c-lisp
; NVVM special-register intrinsics, declared as external zero-argument
; functions so the LLVM backend emits calls the NVPTX target lowers to
; register reads.
; Thread Index
(define ((llvm.nvvm.read.ptx.sreg.tid.x int)))
(define ((llvm.nvvm.read.ptx.sreg.tid.y int)))
; Block Index
(define ((llvm.nvvm.read.ptx.sreg.ctaid.x int)))
(define ((llvm.nvvm.read.ptx.sreg.ctaid.y int)))
; Block Dimensions
(define ((llvm.nvvm.read.ptx.sreg.ntid.x int)))
(define ((llvm.nvvm.read.ptx.sreg.ntid.y int)))

; Matmul kernel: one thread computes one output element c[row, col].
; a, b, c point into addrspace 1 (device global memory).
; NOTE(review): no bounds check on row/col — assumes the launch grid exactly
; covers len x len (the driver uses N=256 with BlockSize=32); confirm before
; reusing with other sizes.
(define ((kernel void) (a (ptr float (addrspace 1))) (b (ptr float (addrspace 1))) (c (ptr float (addrspace 1))) (len int))
; Calculate c[i, j] from a[i, *] and b[*, j]
; a, b, c have dimensions len x len
(declare row int)
(declare col int)
; global index = thread index + block dimension * block index
(set row
(add
(call llvm.nvvm.read.ptx.sreg.tid.x)
(mul (call llvm.nvvm.read.ptx.sreg.ntid.x) (call llvm.nvvm.read.ptx.sreg.ctaid.x))))
(set col
(add
(call llvm.nvvm.read.ptx.sreg.tid.y)
(mul (call llvm.nvvm.read.ptx.sreg.ntid.y) (call llvm.nvvm.read.ptx.sreg.ctaid.y))))

; a-ptr walks row `row` of a (stride 1); b-ptr walks column `col` of b
; (stride len); c-ptr is the single output element (row-major layout).
(declare a-ptr (ptr float (addrspace 1)))
(declare b-ptr (ptr float (addrspace 1)))
(declare c-ptr (ptr float (addrspace 1)))
(set a-ptr (ptradd a (mul row len)))
(set b-ptr (ptradd b col))
(set c-ptr (ptradd c (add col (mul row len))))

(declare c-val float)
(set c-val 0.0)

; Accumulate the dot product of a[row, *] and b[*, col]
(declare k int)
(for ((set k 0)
(lt k len)
(set k (add k 1)))
(set c-val
(fadd
c-val
(fmul (load a-ptr) (load b-ptr))))
(set a-ptr (ptradd a-ptr 1))
(set b-ptr (ptradd b-ptr len)))

(store c-ptr c-val)))
24 changes: 24 additions & 0 deletions src/backend/tests/kernelprog/vecadd/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# NVPTX data layout and target triple patched into the generated LLVM IR
DATALAYOUT = target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"
TRIPLE = target triple = \"nvptx64-nvidia-cuda\"

# C-Lisp kernel to LLVM IR: compile through the pipeline, then patch in the
# NVPTX target and the nvvm.annotations metadata marking @kernel as a GPU
# kernel entry point.
# NOTE(review): the annotation declares @kernel as void (float*, float*, float*);
# confirm this matches the kernel's actual parameter types (including any
# addrspace qualifiers) or module loading may fail.
%.ll: %.sexp
	guile ../../../utils/sexp-json.scm < $< | python ../../../c-lisp.py | python ../../../brilisp.py | python ../../../llvm.py > $@
	sed -e "s/^target datalayout.*/$(DATALAYOUT)/" -i $@
	sed -e "s/^target triple.*/$(TRIPLE)/" -i $@
	echo '!nvvm.annotations = !{!0}' >> $@
	echo '!0 = !{void (float*, float*, float*)* @kernel, !"kernel", i32 1}' >> $@

# LLVM kernel to PTX
%.ptx: %.ll
	llc -mcpu=sm_75 -O0 -o $@ $<

# Driver/host-side control code
%.driver: %.driver.c
	clang -o $@ -I/usr/local/cuda/include -lcuda $<

# Run: feed the PTX image to the driver on stdin
%.run: %.driver %.ptx
	LD_LIBRARY_PATH="/lib/x86_64-linux-gnu/" ./$(word 1, $^) < $(word 2, $^)

# Remove generated artifacts.
# Fix: compiled drivers are named <name>.driver, not "driver", so the old
# "rm -rf ... driver" left them behind.
clean:
	rm -rf *.ll *.ptx *.driver
110 changes: 110 additions & 0 deletions src/backend/tests/kernelprog/vecadd/vecadd.driver.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>

/* Host-side code to launch an arbitrary kernel function and
validate the results against those of a reference function */

// Reference implementation, defined per kernel:
// element-wise vector addition, res[i] = a[i] + b[i] for i in [0, N)
void ref_kernel(float * a, float * b, float * res, int N) {
    int i = 0;
    while (i < N) {
        res[i] = a[i] + b[i];
        i++;
    }
}

// CUDA error checking: ERR_CHECK wraps a CUDA driver API call, stringizes it,
// and aborts the process with a diagnostic on any non-zero status.
#define ERR_CHECK(X) \
    error_check(X, #X)
// call_str: textual form of the failing call (from the # stringize operator).
void error_check(int res, const char * call_str) {
    if (res) {
        // Report on stderr so the message is not lost when stdout is piped
        // (the Makefile pipes this program's stdout).
        fprintf(stderr, "%s returned non-zero status %d\n", call_str, res);
        exit(1);
    }
}

// Read a PTX image containing a kernel from stdin into buf, NUL-terminated.
// NOTE(review): no bounds check — the caller must supply a buffer large
// enough for the whole PTX image plus the terminator.
void read_module(char * buf) {
    printf("Reading kernel from standard input...\n");

    // Must be int, not char: getchar() returns EOF (-1) out-of-band, and a
    // plain char (unsigned on some platforms) can never compare equal to EOF,
    // which would loop forever; on signed-char platforms a 0xFF byte would
    // falsely terminate the read.
    int c;
    while ((c = getchar()) != EOF) {
        *(buf++) = (char)c;
    }
    *buf = '\0';
}

// Host-side driver: reads a PTX image for "kernel" from stdin, launches it
// over a 1-D grid covering an N-element vector add, and validates the device
// result against ref_kernel computed on the host.
int main (int argc, char ** argv) {
int devCount;
CUdevice device;
CUcontext context;
CUmodule module;
// NOTE(review): fixed 4000-byte buffer and read_module does no bounds
// checking — a PTX image of 4000 bytes or more overflows this; confirm size.
char kernel_ptx[4000];
CUfunction kernel_func;

// CUDA initialization and context creation
ERR_CHECK(cuInit(0));
ERR_CHECK(cuDeviceGetCount(&devCount));
ERR_CHECK(cuDeviceGet(&device, 0));
ERR_CHECK(cuCtxCreate(&context, 0, device));

// Load the kernel image and get a handle to the kernel function
read_module(kernel_ptx);
ERR_CHECK(cuModuleLoadData(&module, kernel_ptx));
ERR_CHECK(cuModuleGetFunction(&kernel_func, module, "kernel"));

// Allocate input and result
int N = 1024;
float
*a = malloc(sizeof(float) * N),
*b = malloc(sizeof(float) * N),
*res_device = malloc(sizeof(float) * N),
*res_host = malloc(sizeof(float) * N);

// NOTE(review): rand() values exceed float's 24-bit mantissa, so host and
// device rounding may differ slightly — the max_err report is approximate.
for (int i = 0; i < N; i++) {
a[i] = rand();
b[i] = rand();
}

// Run the reference implementation
ref_kernel(a, b, res_host, N);

// Copy data to the device
CUdeviceptr dev_a, dev_b, dev_res;
ERR_CHECK(cuMemAlloc(&dev_a, sizeof(float) * N));
ERR_CHECK(cuMemAlloc(&dev_b, sizeof(float) * N));
ERR_CHECK(cuMemAlloc(&dev_res, sizeof(float) * N));
ERR_CHECK(cuMemcpyHtoD(dev_a, a, sizeof(float)*N));
ERR_CHECK(cuMemcpyHtoD(dev_b, b, sizeof(float)*N));

// Launch the kernel and wait.
// The driver API takes kernel arguments as an array of pointers to each value.
// NOTE(review): N is not passed to the kernel — the kernel presumably relies
// on the grid exactly covering all N elements (N is a multiple of BlockSize
// here); verify against the kernel source.
void * KernelParams [] = { &dev_a, &dev_b, &dev_res };
int BlockSize = 32;
// Ceil-division so the grid covers all N elements even if N % 32 != 0
int GridSize = (N + BlockSize - 1) / BlockSize;
ERR_CHECK(cuLaunchKernel(kernel_func,
// Grid sizes X, Y, Z
GridSize, 1, 1,
// Block sizes X, Y, Z
BlockSize, 1, 1,
// shared mem size, stream id, kernel params, extra options
0, NULL, KernelParams, NULL));
ERR_CHECK(cuCtxSynchronize());

// Retrieve and verify results
ERR_CHECK(cuMemcpyDtoH(res_device, dev_res, sizeof(float) * N));
float max_err = 0.0;
for (int i = 0; i < N; i++)
max_err = fmax(max_err,
fabs(res_host[i] - res_device[i]));
printf("Max error: %f\n", max_err);

// Cleanup
free(a);
free(b);
free(res_device);
free(res_host);
ERR_CHECK(cuMemFree(dev_a));
ERR_CHECK(cuMemFree(dev_b));
ERR_CHECK(cuMemFree(dev_res));
ERR_CHECK(cuModuleUnload(module));
ERR_CHECK(cuCtxDestroy(context));
}
Loading

0 comments on commit 473274a

Please sign in to comment.