Skip to content

Commit

Permalink
Merge pull request #71 from GlowingScrewdriver/clisp-to-ptx
Browse files Browse the repository at this point in the history
Running C-Lisp code on Nvidia GPUs
  • Loading branch information
chsasank authored Jul 18, 2024
2 parents 7e70722 + 031313e commit 473274a
Show file tree
Hide file tree
Showing 11 changed files with 367 additions and 3 deletions.
7 changes: 6 additions & 1 deletion src/backend/brilisp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ def gen_function(expr):

def gen_type(typ):
    """Translate a Brilisp type s-expression into its JSON (Bril) form.

    Scalar types (plain symbols) pass through unchanged. Compound types
    must be pointers: ``(ptr <elem>)`` or ``(ptr <elem> (addrspace <n>))``,
    which become ``{"ptr": <elem-json>}`` with an optional ``"addrspace"``
    key carrying the address-space number.
    """
    if is_list(typ):
        # Only pointer compound types are supported; fail loudly otherwise.
        assert typ[0] == "ptr", f"unsupported compound type: {typ}"
        retval = {"ptr": gen_type(typ[1])}
        # Optional address-space qualifier, e.g. (ptr float (addrspace 1))
        if len(typ) > 2 and is_list(typ[2]):
            assert typ[2][0] == "addrspace", f"unexpected pointer qualifier: {typ[2]}"
            retval["addrspace"] = typ[2][1]
        return retval
    else:
        # Scalar/base type: returned verbatim
        return typ

Expand Down
6 changes: 4 additions & 2 deletions src/backend/llvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def generate(self, bril_prog):
def gen_type(self, type):
if isinstance(type, dict):
if "ptr" in type:
return self.gen_type(type["ptr"]).as_pointer()
addrspace = type.get("addrspace", 0)
type_obj = self.gen_type(type["ptr"]).as_pointer(addrspace)
return type_obj
else:
raise CodegenError(f"Unknown type {type}")
elif type in ["int", "int32"]:
Expand Down Expand Up @@ -126,7 +128,7 @@ def gen_instructions(self, instrs):
"sitofp": "sitofp",
"ptrtoint": "ptrtoint",
"inttoptr": "inttoptr",
"bitcast": "bitcast"
"bitcast": "bitcast",
}

def gen_label(instr):
Expand Down
19 changes: 19 additions & 0 deletions src/backend/tests/brilisp/addrspace.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
; ModuleID = ""
target triple = "unknown-unknown-unknown"
target datalayout = ""

define void @"add5"(float addrspace(1)* %"a")
{
alloca:
%"a.1" = alloca float addrspace(1)*
%"five" = alloca float
br label %"entry"
entry:
store float addrspace(1)* %"a", float addrspace(1)** %"a.1"
store float 0x4014000000000000, float* %"five"
%".5" = load float addrspace(1)*, float addrspace(1)** %"a.1"
%".6" = load float, float* %"five"
store float %".6", float addrspace(1)* %".5"
ret void
}

7 changes: 7 additions & 0 deletions src/backend/tests/brilisp/addrspace.sexp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
;; CMD: guile ../../utils/sexp-json.scm < addrspace.sexp | python ../../brilisp.py | python ../../llvm.py | sed -e 's/alloca-[a-z]*/alloca/' -e 's/entry-[a-z]*/entry/'

(brilisp
; add5 takes a pointer into address space 1 and stores the constant 5.0
; through it — exercises the (addrspace N) pointer-type qualifier
; end-to-end (see the expected IR in addrspace.out).
(define ((add5 void) (a (ptr float (addrspace 1))))
(set (five float) (const 5.0))
(store a five)
(ret)))
15 changes: 15 additions & 0 deletions src/backend/tests/kernelprog/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## C-Lisp PTX Generation Proof-of-Concept

Each subdirectory here contains a device-side kernel, stored as `<name>.sexp`, and a
host-side driver, stored as `<name>.driver.c`. The driver JIT-compiles and launches
the kernel, then compares its results against a host-side reference implementation.

To test a kernel, run `make <kernel name>.run` from the kernel's directory. This
will
* Compile the kernel from C-Lisp to PTX
* Compile the driver from C to an executable
* Run the driver, which in turn will
- Initialize random inputs
- JIT the kernel and launch it
- Run the reference implementation and compare results
1 change: 1 addition & 0 deletions src/backend/tests/kernelprog/matmul/Makefile
115 changes: 115 additions & 0 deletions src/backend/tests/kernelprog/matmul/matmul.driver.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>

/* Host-side code to launch the kernel function and
validate the results against those of a reference function */

// Reference implementation, defined per kernel:
// naive triple-loop matrix multiply, res = a * b (all N x N, row-major)
void ref_kernel(float * a, float * b, float * res, int N) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            // Accumulate the dot product of row `row` of a and column `col` of b
            // (same addition order as writing through res directly).
            float acc = 0;
            for (int k = 0; k < N; k++)
                acc += a[row * N + k] * b[k * N + col];
            res[row * N + col] = acc;
        }
    }
}

// CUDA error checking: ERR_CHECK wraps a CUDA driver API call, stringizes it,
// and aborts the process with a diagnostic on any non-zero status.
#define ERR_CHECK(X) \
    error_check(X, #X)
// call_str: textual form of the failing call (from the # stringize operator).
void error_check(int res, const char * call_str) {
    if (res) {
        // Report on stderr so the message is not lost when stdout is piped
        // (the Makefile pipes this program's stdout).
        fprintf(stderr, "%s returned non-zero status %d\n", call_str, res);
        exit(1);
    }
}

// Read a PTX image containing a kernel from stdin into buf, NUL-terminated.
// NOTE(review): no bounds check — the caller must supply a buffer large
// enough for the whole PTX image plus the terminator.
void read_module(char * buf) {
    printf("Reading kernel from standard input...\n");

    // Must be int, not char: getchar() returns EOF (-1) out-of-band, and a
    // plain char (unsigned on some platforms) can never compare equal to EOF,
    // which would loop forever; on signed-char platforms a 0xFF byte would
    // falsely terminate the read.
    int c;
    while ((c = getchar()) != EOF) {
        *(buf++) = (char)c;
    }
    *buf = '\0';
}

// Host-side driver: reads a PTX image for "kernel" from stdin, launches it
// over a 2-D grid covering an N x N matmul, and validates the device result
// against ref_kernel computed on the host.
int main (int argc, char ** argv) {
int devCount;
CUdevice device;
CUcontext context;
CUmodule module;
// NOTE(review): fixed 5000-byte buffer and read_module does no bounds
// checking — a PTX image of 5000 bytes or more overflows this; confirm size.
char kernel_ptx[5000];
CUfunction kernel_func;

// CUDA initialization and context creation
ERR_CHECK(cuInit(0));
ERR_CHECK(cuDeviceGetCount(&devCount));
ERR_CHECK(cuDeviceGet(&device, 0));
ERR_CHECK(cuCtxCreate(&context, 0, device));

// Load the kernel image and get a handle to the kernel function
read_module(kernel_ptx);
ERR_CHECK(cuModuleLoadData(&module, kernel_ptx));
ERR_CHECK(cuModuleGetFunction(&kernel_func, module, "kernel"));

// Allocate input and result
int N = 256;
int len = N * N;
float
*a = malloc(sizeof(float) * len),
*b = malloc(sizeof(float) * len),
*res_device = malloc(sizeof(float) * len),
*res_host = malloc(sizeof(float) * len);

// NOTE(review): rand() values exceed float's 24-bit mantissa, so host and
// device rounding may differ slightly — the max_err report is approximate.
for (int i = 0; i < len; i++) {
a[i] = rand();
b[i] = rand();
}

// Run the reference implementation
ref_kernel(a, b, res_host, N);

// Copy data to the device
CUdeviceptr dev_a, dev_b, dev_res;
ERR_CHECK(cuMemAlloc(&dev_a, sizeof(float) * len));
ERR_CHECK(cuMemAlloc(&dev_b, sizeof(float) * len));
ERR_CHECK(cuMemAlloc(&dev_res, sizeof(float) * len));
ERR_CHECK(cuMemcpyHtoD(dev_a, a, sizeof(float) * len));
ERR_CHECK(cuMemcpyHtoD(dev_b, b, sizeof(float) * len));

// Launch the kernel and wait.
// The driver API takes kernel arguments as an array of pointers to each value.
void * KernelParams [] = { &dev_a, &dev_b, &dev_res, &N };
int BlockSize = 32;
// Ceil-division so the 2-D grid covers all N rows/columns even if N % 32 != 0
int GridSize = (N + BlockSize - 1) / BlockSize;
ERR_CHECK(cuLaunchKernel(kernel_func,
// Grid sizes X, Y, Z
GridSize, GridSize, 1,
// Block sizes X, Y, Z
BlockSize, BlockSize, 1,
// shared mem size, stream id, kernel params, extra options
0, NULL, KernelParams, NULL));
ERR_CHECK(cuCtxSynchronize());

// Retrieve and verify results
ERR_CHECK(cuMemcpyDtoH(res_device, dev_res, sizeof(float) * len));
float max_err = 0.0;
for (int i = 0; i < len; i++)
max_err = fmax(max_err,
fabs(res_host[i] - res_device[i]));
printf("Max error: %f\n", max_err);

// Cleanup
free(a);
free(b);
free(res_device);
free(res_host);
ERR_CHECK(cuMemFree(dev_a));
ERR_CHECK(cuMemFree(dev_b));
ERR_CHECK(cuMemFree(dev_res));
ERR_CHECK(cuModuleUnload(module));
ERR_CHECK(cuCtxDestroy(context));
}
47 changes: 47 additions & 0 deletions src/backend/tests/kernelprog/matmul/matmul.sexp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
(c-lisp
; NVVM special-register intrinsics, declared as external zero-argument
; functions so the LLVM backend emits calls the NVPTX target lowers to
; register reads.
; Thread Index
(define ((llvm.nvvm.read.ptx.sreg.tid.x int)))
(define ((llvm.nvvm.read.ptx.sreg.tid.y int)))
; Block Index
(define ((llvm.nvvm.read.ptx.sreg.ctaid.x int)))
(define ((llvm.nvvm.read.ptx.sreg.ctaid.y int)))
; Block Dimensions
(define ((llvm.nvvm.read.ptx.sreg.ntid.x int)))
(define ((llvm.nvvm.read.ptx.sreg.ntid.y int)))

; Matmul kernel: one thread computes one output element c[row, col].
; a, b, c point into addrspace 1 (device global memory).
; NOTE(review): no bounds check on row/col — assumes the launch grid exactly
; covers len x len (the driver uses N=256 with BlockSize=32); confirm before
; reusing with other sizes.
(define ((kernel void) (a (ptr float (addrspace 1))) (b (ptr float (addrspace 1))) (c (ptr float (addrspace 1))) (len int))
; Calculate c[i, j] from a[i, *] and b[*, j]
; a, b, c have dimensions len x len
(declare row int)
(declare col int)
; global index = thread index + block dimension * block index
(set row
(add
(call llvm.nvvm.read.ptx.sreg.tid.x)
(mul (call llvm.nvvm.read.ptx.sreg.ntid.x) (call llvm.nvvm.read.ptx.sreg.ctaid.x))))
(set col
(add
(call llvm.nvvm.read.ptx.sreg.tid.y)
(mul (call llvm.nvvm.read.ptx.sreg.ntid.y) (call llvm.nvvm.read.ptx.sreg.ctaid.y))))

; a-ptr walks row `row` of a (stride 1); b-ptr walks column `col` of b
; (stride len); c-ptr is the single output element (row-major layout).
(declare a-ptr (ptr float (addrspace 1)))
(declare b-ptr (ptr float (addrspace 1)))
(declare c-ptr (ptr float (addrspace 1)))
(set a-ptr (ptradd a (mul row len)))
(set b-ptr (ptradd b col))
(set c-ptr (ptradd c (add col (mul row len))))

(declare c-val float)
(set c-val 0.0)

; Accumulate the dot product of a[row, *] and b[*, col]
(declare k int)
(for ((set k 0)
(lt k len)
(set k (add k 1)))
(set c-val
(fadd
c-val
(fmul (load a-ptr) (load b-ptr))))
(set a-ptr (ptradd a-ptr 1))
(set b-ptr (ptradd b-ptr len)))

(store c-ptr c-val)))
24 changes: 24 additions & 0 deletions src/backend/tests/kernelprog/vecadd/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# NVPTX data layout and target triple patched into the generated LLVM IR
DATALAYOUT = target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"
TRIPLE = target triple = \"nvptx64-nvidia-cuda\"

# C-Lisp kernel to LLVM IR: compile through the pipeline, then patch in the
# NVPTX target and the nvvm.annotations metadata marking @kernel as a GPU
# kernel entry point.
# NOTE(review): the annotation declares @kernel as void (float*, float*, float*);
# confirm this matches the kernel's actual parameter types (including any
# addrspace qualifiers) or module loading may fail.
%.ll: %.sexp
	guile ../../../utils/sexp-json.scm < $< | python ../../../c-lisp.py | python ../../../brilisp.py | python ../../../llvm.py > $@
	sed -e "s/^target datalayout.*/$(DATALAYOUT)/" -i $@
	sed -e "s/^target triple.*/$(TRIPLE)/" -i $@
	echo '!nvvm.annotations = !{!0}' >> $@
	echo '!0 = !{void (float*, float*, float*)* @kernel, !"kernel", i32 1}' >> $@

# LLVM kernel to PTX
%.ptx: %.ll
	llc -mcpu=sm_75 -O0 -o $@ $<

# Driver/host-side control code
%.driver: %.driver.c
	clang -o $@ -I/usr/local/cuda/include -lcuda $<

# Run: feed the PTX image to the driver on stdin
%.run: %.driver %.ptx
	LD_LIBRARY_PATH="/lib/x86_64-linux-gnu/" ./$(word 1, $^) < $(word 2, $^)

# Remove generated artifacts.
# Fix: compiled drivers are named <name>.driver, not "driver", so the old
# "rm -rf ... driver" left them behind.
clean:
	rm -rf *.ll *.ptx *.driver
110 changes: 110 additions & 0 deletions src/backend/tests/kernelprog/vecadd/vecadd.driver.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>

/* Host-side code to launch an arbitrary kernel function and
validate the results against those of a reference function */

// Reference implementation, defined per kernel:
// element-wise vector addition, res[i] = a[i] + b[i] for i in [0, N)
void ref_kernel(float * a, float * b, float * res, int N) {
    int i = 0;
    while (i < N) {
        res[i] = a[i] + b[i];
        i++;
    }
}

// CUDA error checking: ERR_CHECK wraps a CUDA driver API call, stringizes it,
// and aborts the process with a diagnostic on any non-zero status.
#define ERR_CHECK(X) \
    error_check(X, #X)
// call_str: textual form of the failing call (from the # stringize operator).
void error_check(int res, const char * call_str) {
    if (res) {
        // Report on stderr so the message is not lost when stdout is piped
        // (the Makefile pipes this program's stdout).
        fprintf(stderr, "%s returned non-zero status %d\n", call_str, res);
        exit(1);
    }
}

// Read a PTX image containing a kernel from stdin into buf, NUL-terminated.
// NOTE(review): no bounds check — the caller must supply a buffer large
// enough for the whole PTX image plus the terminator.
void read_module(char * buf) {
    printf("Reading kernel from standard input...\n");

    // Must be int, not char: getchar() returns EOF (-1) out-of-band, and a
    // plain char (unsigned on some platforms) can never compare equal to EOF,
    // which would loop forever; on signed-char platforms a 0xFF byte would
    // falsely terminate the read.
    int c;
    while ((c = getchar()) != EOF) {
        *(buf++) = (char)c;
    }
    *buf = '\0';
}

// Host-side driver: reads a PTX image for "kernel" from stdin, launches it
// over a 1-D grid covering an N-element vector add, and validates the device
// result against ref_kernel computed on the host.
int main (int argc, char ** argv) {
int devCount;
CUdevice device;
CUcontext context;
CUmodule module;
// NOTE(review): fixed 4000-byte buffer and read_module does no bounds
// checking — a PTX image of 4000 bytes or more overflows this; confirm size.
char kernel_ptx[4000];
CUfunction kernel_func;

// CUDA initialization and context creation
ERR_CHECK(cuInit(0));
ERR_CHECK(cuDeviceGetCount(&devCount));
ERR_CHECK(cuDeviceGet(&device, 0));
ERR_CHECK(cuCtxCreate(&context, 0, device));

// Load the kernel image and get a handle to the kernel function
read_module(kernel_ptx);
ERR_CHECK(cuModuleLoadData(&module, kernel_ptx));
ERR_CHECK(cuModuleGetFunction(&kernel_func, module, "kernel"));

// Allocate input and result
int N = 1024;
float
*a = malloc(sizeof(float) * N),
*b = malloc(sizeof(float) * N),
*res_device = malloc(sizeof(float) * N),
*res_host = malloc(sizeof(float) * N);

// NOTE(review): rand() values exceed float's 24-bit mantissa, so host and
// device rounding may differ slightly — the max_err report is approximate.
for (int i = 0; i < N; i++) {
a[i] = rand();
b[i] = rand();
}

// Run the reference implementation
ref_kernel(a, b, res_host, N);

// Copy data to the device
CUdeviceptr dev_a, dev_b, dev_res;
ERR_CHECK(cuMemAlloc(&dev_a, sizeof(float) * N));
ERR_CHECK(cuMemAlloc(&dev_b, sizeof(float) * N));
ERR_CHECK(cuMemAlloc(&dev_res, sizeof(float) * N));
ERR_CHECK(cuMemcpyHtoD(dev_a, a, sizeof(float)*N));
ERR_CHECK(cuMemcpyHtoD(dev_b, b, sizeof(float)*N));

// Launch the kernel and wait.
// The driver API takes kernel arguments as an array of pointers to each value.
// NOTE(review): N is not passed to the kernel — the kernel presumably relies
// on the grid exactly covering all N elements (N is a multiple of BlockSize
// here); verify against the kernel source.
void * KernelParams [] = { &dev_a, &dev_b, &dev_res };
int BlockSize = 32;
// Ceil-division so the grid covers all N elements even if N % 32 != 0
int GridSize = (N + BlockSize - 1) / BlockSize;
ERR_CHECK(cuLaunchKernel(kernel_func,
// Grid sizes X, Y, Z
GridSize, 1, 1,
// Block sizes X, Y, Z
BlockSize, 1, 1,
// shared mem size, stream id, kernel params, extra options
0, NULL, KernelParams, NULL));
ERR_CHECK(cuCtxSynchronize());

// Retrieve and verify results
ERR_CHECK(cuMemcpyDtoH(res_device, dev_res, sizeof(float) * N));
float max_err = 0.0;
for (int i = 0; i < N; i++)
max_err = fmax(max_err,
fabs(res_host[i] - res_device[i]));
printf("Max error: %f\n", max_err);

// Cleanup
free(a);
free(b);
free(res_device);
free(res_host);
ERR_CHECK(cuMemFree(dev_a));
ERR_CHECK(cuMemFree(dev_b));
ERR_CHECK(cuMemFree(dev_res));
ERR_CHECK(cuModuleUnload(module));
ERR_CHECK(cuCtxDestroy(context));
}
Loading

0 comments on commit 473274a

Please sign in to comment.