Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/blocks/extract_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#define CUDA_KERNEL "kernel:cuda:auto"
#define CUDA_KERNEL_COOP "kernel:cuda:coop"
#define CUDA_KERNEL_COOP_COPY_OUT "kernel:cuda:coop:copy_out"

namespace block {

Expand Down
19 changes: 19 additions & 0 deletions samples/outputs.var_names/sample37
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,27 @@ void __global__ cuda_kernel_0 (int* arg0) {
arg0[thread_id_2] = 0;
}

void __global__ cuda_kernel_1 (int* arg0) {
int thread_id_5 = (blockIdx.x * 512) + threadIdx.x;
arg0[thread_id_5] = 0;
}

char ret_2_0[sizeof(int*)] __device__;
void __global__ cuda_kernel_2 (int* arg0) {
int thread_id_8 = (blockIdx.x * 512) + threadIdx.x;
arg0[thread_id_8] = 0;
if (!(blockIdx.x * blockDim.x + threadIdx.x)) {
runtime::cudaMemcpyToSymbolMagic(ret_2_0, arg0);
}
}

void bar (int* arg0) {
cuda_kernel_0<<<128, 512>>>(arg0);
cudaDeviceSynchronize();
runtime::LaunchCooperativeKernel((void*)cuda_kernel_1, 128, 512, arg0);
cudaDeviceSynchronize();
runtime::LaunchCooperativeKernel((void*)cuda_kernel_2, 128, 512, arg0);
cudaDeviceSynchronize();
runtime::cudaMemcpyFromSymbolMagic((&(arg0)), ret_2_0);
}

19 changes: 19 additions & 0 deletions samples/outputs/sample37
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,27 @@ void __global__ cuda_kernel_0 (int* arg0) {
arg0[var2] = 0;
}

void __global__ cuda_kernel_1 (int* arg0) {
int var5 = (blockIdx.x * 512) + threadIdx.x;
arg0[var5] = 0;
}

char ret_2_0[sizeof(int*)] __device__;
void __global__ cuda_kernel_2 (int* arg0) {
int var8 = (blockIdx.x * 512) + threadIdx.x;
arg0[var8] = 0;
if (!(blockIdx.x * blockDim.x + threadIdx.x)) {
runtime::cudaMemcpyToSymbolMagic(ret_2_0, arg0);
}
}

void bar (int* arg0) {
cuda_kernel_0<<<128, 512>>>(arg0);
cudaDeviceSynchronize();
runtime::LaunchCooperativeKernel((void*)cuda_kernel_1, 128, 512, arg0);
cudaDeviceSynchronize();
runtime::LaunchCooperativeKernel((void*)cuda_kernel_2, 128, 512, arg0);
cudaDeviceSynchronize();
runtime::cudaMemcpyFromSymbolMagic((&(arg0)), ret_2_0);
}

14 changes: 14 additions & 0 deletions samples/sample37.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ static void bar(dyn_var<int *> buffer) {
buffer[thread_id] = 0;
}
}
builder::annotate(CUDA_KERNEL_COOP);
for (dyn_var<int> cta = 0; cta < 128; cta = cta + 1) {
for (dyn_var<int> tid = 0; tid < 512; tid = tid + 1) {
dyn_var<int> thread_id = cta * 512 + tid;
buffer[thread_id] = 0;
}
}
builder::annotate(CUDA_KERNEL_COOP_COPY_OUT);
for (dyn_var<int> cta = 0; cta < 128; cta = cta + 1) {
for (dyn_var<int> tid = 0; tid < 512; tid = tid + 1) {
dyn_var<int> thread_id = cta * 512 + tid;
buffer[thread_id] = 0;
}
}
}

int main(int argc, char *argv[]) {
Expand Down
11 changes: 8 additions & 3 deletions src/blocks/extract_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,16 @@ block::Ptr extract_single_cuda(block::Ptr from, std::vector<decl_stmt::Ptr> &new
}

int is_coop = 0;
int is_copy_out = 0;
stmt::Ptr found_loop = annotation_finder::find_annotation(from, CUDA_KERNEL);
if (found_loop == nullptr) {
found_loop = annotation_finder::find_annotation(from, CUDA_KERNEL_COOP);
if (found_loop == nullptr) {
return nullptr;
found_loop = annotation_finder::find_annotation(from, CUDA_KERNEL_COOP_COPY_OUT);
if (found_loop == nullptr) {
return nullptr;
}
is_copy_out = 1;
}
is_coop = 1;
}
Expand Down Expand Up @@ -92,7 +97,7 @@ block::Ptr extract_single_cuda(block::Ptr from, std::vector<decl_stmt::Ptr> &new
int this_kern_index = total_created_kernels;
total_created_kernels++;
std::vector<var::Ptr> ret_vars;
if (is_coop) {
if (is_coop && is_copy_out) {
// If this is coop, we will create some extra decls to return the copied values
int i = 0;
for (auto v : vars) {
Expand Down Expand Up @@ -210,7 +215,7 @@ block::Ptr extract_single_cuda(block::Ptr from, std::vector<decl_stmt::Ptr> &new

// If this is a coop kernel, return the values
std::vector<stmt::Ptr> copy_backs;
if (is_coop) {
if (is_coop && is_copy_out) {
auto if_s = std::make_shared<if_stmt>();
auto nvar = std::make_shared<var>();
nvar->var_type = builder::dyn_var<int>::create_block_type();
Expand Down