-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #71 from GlowingScrewdriver/clisp-to-ptx
Running C-Lisp code on Nvidia GPUs
- Loading branch information
Showing
11 changed files
with
367 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
; ModuleID = "" | ||
target triple = "unknown-unknown-unknown" | ||
target datalayout = "" | ||
|
||
define void @"add5"(float addrspace(1)* %"a") | ||
{ | ||
alloca: | ||
%"a.1" = alloca float addrspace(1)* | ||
%"five" = alloca float | ||
br label %"entry" | ||
entry: | ||
store float addrspace(1)* %"a", float addrspace(1)** %"a.1" | ||
store float 0x4014000000000000, float* %"five" | ||
%".5" = load float addrspace(1)*, float addrspace(1)** %"a.1" | ||
%".6" = load float, float* %"five" | ||
store float %".6", float addrspace(1)* %".5" | ||
ret void | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
;; CMD: guile ../../utils/sexp-json.scm < {filename} | python ../../brilisp.py | python ../../llvm.py | sed -e 's/alloca-[a-z]*/alloca/' -e 's/entry-[a-z]*/entry/' | ||
|
||
;; Test input: a single function whose parameter is a pointer carrying an
;; explicit address space.
;;   add5 - returns void; `a` is a pointer to float in address space 1.
;;   The body binds the float constant 5.0 to `five`, stores `five` through
;;   `a`, and returns.
(brilisp | ||
(define ((add5 void) (a (ptr float (addrspace 1)))) | ||
(set (five float) (const 5.0)) | ||
(store a five) | ||
(ret))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
## C-Lisp PTX Generation Proof-of-Concept | ||
|
||
Each subdirectory here has a device-side kernel, stored as `<name>.sexp`, along with | ||
a host-side driver, stored as `<name>.driver.c`. `<name>.driver.c` files contain code | ||
to JIT-compile and launch a kernel, along with a reference implementation against which | ||
results are compared. | ||
|
||
To test a kernel, run `make <kernel name>.run` from the kernel's directory. This | ||
will | ||
* Compile the kernel from C-Lisp to PTX | ||
* Compile the driver from C to an executable | ||
* Run the driver, which in turn will | ||
- Initialize random inputs | ||
- JIT the kernel and launch it | ||
- Run the reference implementation and compare results |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../vecadd/Makefile |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#include <stdlib.h> | ||
#include <stdio.h> | ||
#include <cuda.h> | ||
#include <math.h> | ||
|
||
/* Host-side code to launch the kernel function and | ||
validate the results against those of a reference function */ | ||
|
||
// Reference implementation, defined per kernel | ||
/*
 * CPU reference: product of two N x N matrices stored row-major.
 *
 *   a, b  input matrices, N * N floats each; only read
 *   res   output matrix, N * N floats; every element is overwritten with
 *         sum over k of a[i][k] * b[k][j]
 *   N     matrix dimension; nothing is written when N <= 0
 *
 * Indices are computed in size_t so that i * N cannot overflow int for
 * large N.
 */
void ref_kernel(const float * a, const float * b, float * res, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            // Accumulate in a local and store once per output element
            float acc = 0;
            for (int k = 0; k < N; k++)
                acc += a[(size_t)i * N + k] * b[(size_t)k * N + j];
            res[(size_t)i * N + j] = acc;
        }
    }
}
|
||
// CUDA error checking | ||
/*
 * ERR_CHECK(X) evaluates X exactly once and passes its integer status,
 * together with the source text of X, to error_check().
 */
#define ERR_CHECK(X) \
    error_check((X), #X)

/*
 * Abort the program when a call reported failure.
 *
 *   res       status returned by the call; zero means success
 *   call_str  text identifying the call, used only in the diagnostic
 *
 * On a non-zero status the message is written to stderr and the process
 * exits with status 1; otherwise the function returns normally.
 */
void error_check(int res, const char * call_str) {
    if (res) {
        fprintf(stderr, "%s returned non-zero status %d\n", call_str, res);
        exit(1);
    }
}
|
||
// Read a PTX image containing a kernel from stdin | ||
/*
 * Copy everything available on stdin into buf and NUL-terminate it.
 *
 *   buf  destination; the function has no way to know its capacity, so the
 *        caller must provide room for the whole input plus the terminator
 *
 * A progress message is printed to stdout before reading.
 */
void read_module(char * buf) {
    printf("Reading kernel from standard input...\n");

    // getchar() returns an int so that EOF is distinguishable from every
    // byte value; storing it in a char first would make byte 0xFF look like
    // EOF (signed char) or make EOF undetectable (unsigned char).
    int c;
    while ((c = getchar()) != EOF) {
        *(buf++) = (char)c;
    }
    *buf = '\0';
}
|
||
int main (int argc, char ** argv) { | ||
int devCount; | ||
CUdevice device; | ||
CUcontext context; | ||
CUmodule module; | ||
char kernel_ptx[5000]; | ||
CUfunction kernel_func; | ||
|
||
// CUDA initialization and context creation | ||
ERR_CHECK(cuInit(0)); | ||
ERR_CHECK(cuDeviceGetCount(&devCount)); | ||
ERR_CHECK(cuDeviceGet(&device, 0)); | ||
ERR_CHECK(cuCtxCreate(&context, 0, device)); | ||
|
||
// Load the kernel image and get a handle to the kernel function | ||
read_module(kernel_ptx); | ||
ERR_CHECK(cuModuleLoadData(&module, kernel_ptx)); | ||
ERR_CHECK(cuModuleGetFunction(&kernel_func, module, "kernel")); | ||
|
||
// Allocate input and result | ||
int N = 256; | ||
int len = N * N; | ||
float | ||
*a = malloc(sizeof(float) * len), | ||
*b = malloc(sizeof(float) * len), | ||
*res_device = malloc(sizeof(float) * len), | ||
*res_host = malloc(sizeof(float) * len); | ||
|
||
for (int i = 0; i < len; i++) { | ||
a[i] = rand(); | ||
b[i] = rand(); | ||
} | ||
|
||
// Run the reference implementation | ||
ref_kernel(a, b, res_host, N); | ||
|
||
// Copy data to the device | ||
CUdeviceptr dev_a, dev_b, dev_res; | ||
ERR_CHECK(cuMemAlloc(&dev_a, sizeof(float) * len)); | ||
ERR_CHECK(cuMemAlloc(&dev_b, sizeof(float) * len)); | ||
ERR_CHECK(cuMemAlloc(&dev_res, sizeof(float) * len)); | ||
ERR_CHECK(cuMemcpyHtoD(dev_a, a, sizeof(float) * len)); | ||
ERR_CHECK(cuMemcpyHtoD(dev_b, b, sizeof(float) * len)); | ||
|
||
// Launch the kernel and wait | ||
void * KernelParams [] = { &dev_a, &dev_b, &dev_res, &N }; | ||
int BlockSize = 32; | ||
int GridSize = (N + BlockSize - 1) / BlockSize; | ||
ERR_CHECK(cuLaunchKernel(kernel_func, | ||
// Grid sizes X, Y, Z | ||
GridSize, GridSize, 1, | ||
// Block sizes X, Y, Z | ||
BlockSize, BlockSize, 1, | ||
// shared mem size, stream id, kernel params, extra options | ||
0, NULL, KernelParams, NULL)); | ||
ERR_CHECK(cuCtxSynchronize()); | ||
|
||
// Retrieve and verify results | ||
ERR_CHECK(cuMemcpyDtoH(res_device, dev_res, sizeof(float) * len)); | ||
float max_err = 0.0; | ||
for (int i = 0; i < len; i++) | ||
max_err = fmax(max_err, | ||
fabs(res_host[i] - res_device[i])); | ||
printf("Max error: %f\n", max_err); | ||
|
||
// Cleanup | ||
free(a); | ||
free(b); | ||
free(res_device); | ||
free(res_host); | ||
ERR_CHECK(cuMemFree(dev_a)); | ||
ERR_CHECK(cuMemFree(dev_b)); | ||
ERR_CHECK(cuMemFree(dev_res)); | ||
ERR_CHECK(cuModuleUnload(module)); | ||
ERR_CHECK(cuCtxDestroy(context)); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
; Matrix-multiplication kernel written in C-Lisp.
; The body-less definitions below declare int-returning functions named after
; LLVM's NVVM special-register intrinsics so that `kernel` can call them.
(c-lisp | ||
; Thread Index | ||
(define ((llvm.nvvm.read.ptx.sreg.tid.x int))) | ||
(define ((llvm.nvvm.read.ptx.sreg.tid.y int))) | ||
; Block Index | ||
(define ((llvm.nvvm.read.ptx.sreg.ctaid.x int))) | ||
(define ((llvm.nvvm.read.ptx.sreg.ctaid.y int))) | ||
; Block Dimensions | ||
(define ((llvm.nvvm.read.ptx.sreg.ntid.x int))) | ||
(define ((llvm.nvvm.read.ptx.sreg.ntid.y int))) | ||
|
||
; kernel: a, b, c are pointers to float in address space 1; len is an int.
; Each invocation computes one value and stores it through c.
(define ((kernel void) (a (ptr float (addrspace 1))) (b (ptr float (addrspace 1))) (c (ptr float (addrspace 1))) (len int)) | ||
; Calculate c[i, j] from a[i, *] and b[*, j] | ||
; a, b, c have dimensions len x len | ||
(declare row int) | ||
(declare col int) | ||
; row = tid.x + ntid.x * ctaid.x ; col = tid.y + ntid.y * ctaid.y
; NOTE(review): no comparison of row/col against len is visible here, so this
; appears to rely on the launch covering exactly len x len threads - confirm.
(set row | ||
(add | ||
(call llvm.nvvm.read.ptx.sreg.tid.x) | ||
(mul (call llvm.nvvm.read.ptx.sreg.ntid.x) (call llvm.nvvm.read.ptx.sreg.ctaid.x)))) | ||
(set col | ||
(add | ||
(call llvm.nvvm.read.ptx.sreg.tid.y) | ||
(mul (call llvm.nvvm.read.ptx.sreg.ntid.y) (call llvm.nvvm.read.ptx.sreg.ctaid.y)))) | ||
|
||
; Starting positions: a-ptr = a + row*len, b-ptr = b + col,
; c-ptr = c + col + row*len.
; Presumably ptradd offsets are counted in elements rather than bytes, which
; is what row-major indexing of float matrices would need - TODO confirm.
(declare a-ptr (ptr float (addrspace 1))) | ||
(declare b-ptr (ptr float (addrspace 1))) | ||
(declare c-ptr (ptr float (addrspace 1))) | ||
(set a-ptr (ptradd a (mul row len))) | ||
(set b-ptr (ptradd b col)) | ||
(set c-ptr (ptradd c (add col (mul row len)))) | ||
|
||
; Running sum of products for this thread's output element
(declare c-val float) | ||
(set c-val 0.0) | ||
|
||
; For k = 0 .. len-1: add (*a-ptr) * (*b-ptr) to c-val, then advance a-ptr by
; 1 and b-ptr by len.
(declare k int) | ||
(for ((set k 0) | ||
(lt k len) | ||
(set k (add k 1))) | ||
(set c-val | ||
(fadd | ||
c-val | ||
(fmul (load a-ptr) (load b-ptr)))) | ||
(set a-ptr (ptradd a-ptr 1)) | ||
(set b-ptr (ptradd b-ptr len))) | ||
|
||
; Write the accumulated value to the output position computed above
(store c-ptr c-val))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Build and run pipeline for a kernel directory:
#   <name>.sexp --> <name>.ll --> <name>.ptx, and <name>.driver.c --> <name>.driver
# `make <name>.run` builds both and feeds the PTX to the driver on stdin.

# Replacement `target datalayout` / `target triple` lines patched into the
# generated IR. The quotes are escaped because the values are substituted
# inside double-quoted sed expressions.
DATALAYOUT = target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"
TRIPLE = target triple = \"nvptx64-nvidia-cuda\"

# `clean` names a command, not a file
.PHONY: clean

# C-Lisp kernel to LLVM
# After generation, the target lines are rewritten and NVVM annotation
# metadata naming @kernel is appended.
# NOTE(review): the annotation hard-codes a `void (float*, float*, float*)*`
# signature for @kernel; confirm that the llc in use accepts it for kernels
# declared with other parameter lists.
%.ll: %.sexp
	guile ../../../utils/sexp-json.scm < $< | python ../../../c-lisp.py | python ../../../brilisp.py | python ../../../llvm.py > $@
	sed -e "s/^target datalayout.*/$(DATALAYOUT)/" -i $@
	sed -e "s/^target triple.*/$(TRIPLE)/" -i $@
	echo '!nvvm.annotations = !{!0}' >> $@
	echo '!0 = !{void (float*, float*, float*)* @kernel, !"kernel", i32 1}' >> $@

# LLVM kernel to PTX
%.ptx: %.ll
	llc -mcpu=sm_75 -O0 -o $@ $<

# Driver/host-side control code
# Libraries follow the source file so the linker sees them after the symbols
# that need them; -lm covers the <math.h> functions the drivers call.
%.driver: %.driver.c
	clang -o $@ -I/usr/local/cuda/include $< -lcuda -lm

# Run the driver (first prerequisite) with the PTX (second prerequisite) on stdin
%.run: %.driver %.ptx
	LD_LIBRARY_PATH="/lib/x86_64-linux-gnu/" ./$(word 1, $^) < $(word 2, $^)

# Built drivers are named <name>.driver, so remove them by pattern
clean:
	rm -rf *.ll *.ptx *.driver driver
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#include <stdlib.h> | ||
#include <stdio.h> | ||
#include <cuda.h> | ||
#include <math.h> | ||
|
||
/* Host-side code to launch an arbitrary kernel function and | ||
validate the results against those of a reference function */ | ||
|
||
// Reference implementation, defined per kernel | ||
/*
 * CPU reference: element-wise sum of two float arrays.
 *
 *   a, b  input arrays of N floats each; only read
 *   res   output array of N floats; res[i] receives a[i] + b[i]
 *   N     element count; nothing is written when N <= 0
 */
void ref_kernel(const float * a, const float * b, float * res, int N) {
    for (int i = 0; i < N; i++) {
        res[i] = a[i] + b[i];
    }
}
|
||
// CUDA error checking | ||
/*
 * ERR_CHECK(X) evaluates X exactly once and passes its integer status,
 * together with the source text of X, to error_check().
 */
#define ERR_CHECK(X) \
    error_check((X), #X)

/*
 * Abort the program when a call reported failure.
 *
 *   res       status returned by the call; zero means success
 *   call_str  text identifying the call, used only in the diagnostic
 *
 * On a non-zero status the message is written to stderr and the process
 * exits with status 1; otherwise the function returns normally.
 */
void error_check(int res, const char * call_str) {
    if (res) {
        fprintf(stderr, "%s returned non-zero status %d\n", call_str, res);
        exit(1);
    }
}
|
||
// Read a PTX image containing a kernel from stdin | ||
/*
 * Copy everything available on stdin into buf and NUL-terminate it.
 *
 *   buf  destination; the function has no way to know its capacity, so the
 *        caller must provide room for the whole input plus the terminator
 *
 * A progress message is printed to stdout before reading.
 */
void read_module(char * buf) {
    printf("Reading kernel from standard input...\n");

    // getchar() returns an int so that EOF is distinguishable from every
    // byte value; storing it in a char first would make byte 0xFF look like
    // EOF (signed char) or make EOF undetectable (unsigned char).
    int c;
    while ((c = getchar()) != EOF) {
        *(buf++) = (char)c;
    }
    *buf = '\0';
}
|
||
int main (int argc, char ** argv) { | ||
int devCount; | ||
CUdevice device; | ||
CUcontext context; | ||
CUmodule module; | ||
char kernel_ptx[4000]; | ||
CUfunction kernel_func; | ||
|
||
// CUDA initialization and context creation | ||
ERR_CHECK(cuInit(0)); | ||
ERR_CHECK(cuDeviceGetCount(&devCount)); | ||
ERR_CHECK(cuDeviceGet(&device, 0)); | ||
ERR_CHECK(cuCtxCreate(&context, 0, device)); | ||
|
||
// Load the kernel image and get a handle to the kernel function | ||
read_module(kernel_ptx); | ||
ERR_CHECK(cuModuleLoadData(&module, kernel_ptx)); | ||
ERR_CHECK(cuModuleGetFunction(&kernel_func, module, "kernel")); | ||
|
||
// Allocate input and result | ||
int N = 1024; | ||
float | ||
*a = malloc(sizeof(float) * N), | ||
*b = malloc(sizeof(float) * N), | ||
*res_device = malloc(sizeof(float) * N), | ||
*res_host = malloc(sizeof(float) * N); | ||
|
||
for (int i = 0; i < N; i++) { | ||
a[i] = rand(); | ||
b[i] = rand(); | ||
} | ||
|
||
// Run the reference implementation | ||
ref_kernel(a, b, res_host, N); | ||
|
||
// Copy data to the device | ||
CUdeviceptr dev_a, dev_b, dev_res; | ||
ERR_CHECK(cuMemAlloc(&dev_a, sizeof(float) * N)); | ||
ERR_CHECK(cuMemAlloc(&dev_b, sizeof(float) * N)); | ||
ERR_CHECK(cuMemAlloc(&dev_res, sizeof(float) * N)); | ||
ERR_CHECK(cuMemcpyHtoD(dev_a, a, sizeof(float)*N)); | ||
ERR_CHECK(cuMemcpyHtoD(dev_b, b, sizeof(float)*N)); | ||
|
||
// Launch the kernel and wait | ||
void * KernelParams [] = { &dev_a, &dev_b, &dev_res }; | ||
int BlockSize = 32; | ||
int GridSize = (N + BlockSize - 1) / BlockSize; | ||
ERR_CHECK(cuLaunchKernel(kernel_func, | ||
// Grid sizes X, Y, Z | ||
GridSize, 1, 1, | ||
// Block sizes X, Y, Z | ||
BlockSize, 1, 1, | ||
// shared mem size, stream id, kernel params, extra options | ||
0, NULL, KernelParams, NULL)); | ||
ERR_CHECK(cuCtxSynchronize()); | ||
|
||
// Retrieve and verify results | ||
ERR_CHECK(cuMemcpyDtoH(res_device, dev_res, sizeof(float) * N)); | ||
float max_err = 0.0; | ||
for (int i = 0; i < N; i++) | ||
max_err = fmax(max_err, | ||
fabs(res_host[i] - res_device[i])); | ||
printf("Max error: %f\n", max_err); | ||
|
||
// Cleanup | ||
free(a); | ||
free(b); | ||
free(res_device); | ||
free(res_host); | ||
ERR_CHECK(cuMemFree(dev_a)); | ||
ERR_CHECK(cuMemFree(dev_b)); | ||
ERR_CHECK(cuMemFree(dev_res)); | ||
ERR_CHECK(cuModuleUnload(module)); | ||
ERR_CHECK(cuCtxDestroy(context)); | ||
} |
Oops, something went wrong.