Skip to content

Commit

Permalink
pass compile, fixed the use of constant memory, sigsegv on cudaFree???
Browse files Browse the repository at this point in the history
  • Loading branch information
Tonglin Chen committed Apr 22, 2019
1 parent 7376542 commit f517c36
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 44 deletions.
53 changes: 27 additions & 26 deletions groupby_hash.cu
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/device_ptr.h>

#include "cpuGroupby.h"
#include "groupby_hash_templates.cu"

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#include "groupby_hash.cuh"

// is there dynamic size constant memory?
__constant__ reductionType ops_d[512];
__constant__ reductionType ops_c[512];


#include "groupby_hash_templates.cu"


void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key_rows,
const int* value_columns_h, int num_value_columns, int num_value_rows,
reductionType* ops, int num_ops, int* output_keys, int* output_values, int &num_output_rows)
{
constexpr unsigned int BLOCKDIM = 1024;
constexpr unsigned int HASH_TABLE_SIZE = 1003;
constexpr unsigned int GRIDDIM = 40; // 40 as GTX 1080 only have 20 SMs and it can schedule 2048 threads
// change to 56*2 = 112 if testing on Tesla P100

// variableAllocating
int* key_columns_d = NULL;
int* value_columns_d = NULL;
Expand All @@ -39,40 +39,40 @@ void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key
// initialize values
gpuErrchk(cudaMemcpy(key_columns_d, key_columns_h, sizeof(int)*num_key_columns*num_key_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(value_columns_d, value_columns_h, sizeof(int)*num_value_columns*num_value_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(ops_d, ops, sizeof(reductionType) * num_ops));
initializeVariable<int><<<50, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, ops_d, num_ops);
gpuErrchk(cudaMemcpyToSymbol(ops_c, ops, sizeof(reductionType) * num_ops));
initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

// fill hash table
fillTable<int, int><<<50, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE, ops_d, num_ops);
fillTable<int, int><<<GRIDDIM, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

// shrink the hash table to output array
//shrink the hash table to output array
//Create array of idices for hash table
int * seq, hashTable_idxs;
int *seq, *hashTable_idxs;
cudaMalloc((void**)&seq, HASH_TABLE_SIZE*sizeof(int)); //for hash index sequence
cudaMalloc((void**)&key_idxs, HASH_TABLE_SIZE*sizeof(int)); //for key indexs without -1
cudaMalloc((void**)&hashTable_idxs, HASH_TABLE_SIZE*sizeof(int)); //for key indexs without -1
thrust::device_ptr<int> hash_d_seq = thrust::device_pointer_cast(seq); //for hash index sequence
thrust::device_ptr<int> hashTable_idxs_d = thrust::device_pointer_cast(hashTable_idxs); //for key indexs without -1
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE; //fill hash index seq
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE); //fill hash index seq


//copy hash index of keys, removing -1's which signify unused slots
// copy_if(policy, index seq start, index seq end, hash keys for comparison, result containing idx to non -1's, comparator)
int *new_end = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE, hash_key_idx_d, hashTable_idxs_d, is_not_neg_1());
auto newEnd = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE, hash_key_idx_d, hashTable_idxs_d, is_not_neg_1());

num_output_rows = newEnd - hashTable_idxs_d;

int* output_key_columns_d = NULL;
cudaMalloc(&output_key_columns_d, sizeof(int)*num_key_columns*num_output_rows);
copyUnique<int><<<50,BLOCKDIM>>>(hashTable_idxs_d, hash_key_idx_d,key_columns_d, output_key_columns_d, num_output_rows, num_key_columns, num_key_rows);
copyUnique<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_key_idx_d,key_columns_d, output_key_columns_d, num_output_rows, num_key_columns, num_key_rows);

int* output_value_columns_d = NULL;
cudaMalloc(&output_value_columns_d, sizeof(int)*num_value_columns*num_output_rows);
copyValues<int><<<50,BLOCKDIM>>>(hashTable_idxs_d, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, ops_d, num_ops, HASH_TABLE_SIZE);
copyValues<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, num_ops, HASH_TABLE_SIZE);

gpuErrchk(cudaDeviceSynchronize());

Expand All @@ -91,6 +91,7 @@ void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key
cudaFree(hash_results_d);
cudaFree(output_key_columns_d);
cudaFree(output_value_columns_d);
cudaFree(key_idxs);
cudaFree(seq);
cudaFree(hashTable_idxs);

}
19 changes: 19 additions & 0 deletions groupby_hash.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#ifndef GROUPBY_HASH_CUH
#define GROUPBY_HASH_CUH

#include <cstdio>   // fprintf, stderr
#include <cstdlib>  // exit

// NOTE(review): this header also relies on cudaError_t / cudaSuccess /
// cudaGetErrorString (from <cuda_runtime.h>) and on reductionType
// (declared in cpuGroupby.h); includers must pull those in first, as
// groupby_hash.cu and main_hash.cu currently do — confirm before
// adding the includes here.

// Wraps a CUDA runtime call: on failure, report the error string with
// the call site's file/line and abort.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints a diagnostic for a failed CUDA API call to stderr.
// When abort is true (the default) the process exits with the CUDA
// error code as its exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// GPU hash-based groupby.
//   key_columns_h / value_columns_h: column-major host input matrices.
//   ops / num_ops: per-value-column reduction operators.
//   output_keys / output_values: caller-allocated host output buffers.
//   num_output_rows: set to the number of distinct key rows produced.
void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key_rows,
                      const int* value_columns_h, int num_value_columns, int num_value_rows,
                      reductionType* ops, int num_ops, int* output_keys, int* output_values, int &num_output_rows);


#endif
30 changes: 13 additions & 17 deletions groupby_hash_templates.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ size_t HashKey(size_t idx) {
template <typename Tval> __device__
void updateEntry(Tval* value_columns,
size_t num_val_rows,
reductionType* ops,
size_t num_ops,
size_t idx,
size_t hashPos,
Expand All @@ -47,7 +46,7 @@ void updateEntry(Tval* value_columns,
for (size_t i = 0; i < num_ops; ++i) {
Tval value = value_columns[i * num_val_rows + idx];
size_t val_idx = i * len_hash_table + hashPos;
switch(ops[i]) {
switch(ops_c[i]) {
case rmin:
atomicMin(&(hash_results[val_idx]), value);
break;
Expand Down Expand Up @@ -77,7 +76,6 @@ void fillTable(Tkey* key_columns,
int* hash_count,
Tval* hash_results,
size_t len_hash_table,
reductionType* ops,
size_t num_ops)
{
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down Expand Up @@ -106,7 +104,7 @@ void fillTable(Tkey* key_columns,
}
// now it is safe to update the entry
isInserted = true;
updateEntry<Tval>(value_columns, num_val_rows, ops, num_ops, i, curPos, hash_results, &(hash_count[curPos]), len_hash_table);
updateEntry<Tval>(value_columns, num_val_rows, num_ops, i, curPos, hash_results, &(hash_count[curPos]), len_hash_table);
}
if (!isInserted) {
// Do sth in the case of overflowing hash table
Expand All @@ -119,7 +117,6 @@ void initializeVariable(int* hash_key_idx,
int* hash_count,
Tval* hash_results,
size_t len_hash_table,
reductionType* ops,
size_t num_ops)
{
// each thread responsible for one entry (with thread coarsening)
Expand All @@ -130,9 +127,9 @@ void initializeVariable(int* hash_key_idx,
hash_count[i] = 0;
for (size_t j = 0; j < num_ops; ++j) {
// replace following with specialized limit template in the future
if (ops[i] == rmin) {
if (ops_c[j] == rmin) {
hash_results[j * len_hash_table + i] = cuda_custom::limits<Tval>::max();
} else if (ops[i] == rmax) {
} else if (ops_c[j] == rmax) {
hash_results[j * len_hash_table + i] = cuda_custom::limits<Tval>::lowest();
} else {
hash_results[j * len_hash_table + i] = 0;
Expand All @@ -149,12 +146,12 @@ void copyUnique(
Tval* key_columns_d,
Tval* output_key_columns_d,
int num_output_rows,
int num_key_columns.
int num_key_columns,
int num_key_rows)
{
idx = threadIdx.x + blockIdx.x * blockDim.x;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < num_output_rows){
for (int i = 0; i < num_key_columns, i++){//each column of key matrix
for (int i = 0; i < num_key_columns; i++){//each column of key matrix
output_key_columns_d[idx+num_output_rows*i] = key_columns_d[hash_key_idx_d[hashTable_idxs_d[idx]]+num_key_rows*i];//copy original key entry to output
}
idx += gridDim.x*blockDim.x;//increment idx by thread space
Expand All @@ -169,18 +166,17 @@ void copyValues(
Tval* value_columns_d,
Tval* output_value_columns_d,
int num_output_rows,
int num_value_columns.
int num_value_columns,
int num_value_rows,
reductionType* ops,
size_t num_ops,
size_t len_hash_table,
size_t len_hash_table
)
{
idx = threadIdx.x + blockIdx.x * blockDim.x;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < num_output_rows){
for (size_t i = 0; i < num_ops; ++i) {
size_t val_idx = i * len_hash_table + hashTable_idxs_d[idx];
switch(ops[i]) {
switch(ops_c[i]) {
case rmin:
output_value_columns_d[idx+num_output_rows*i] = hash_results_d[val_idx];//copy result to output
break;
Expand Down Expand Up @@ -208,6 +204,6 @@ struct is_not_neg_1
__host__ __device__
bool operator()(const int x)
{
return x != -1s;
return x != -1;
}
};
};
81 changes: 81 additions & 0 deletions main_hash.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#include <iostream>
#include <algorithm>
#include <chrono>
#include <vector>
#include <string>
#include "cpuGroupby.h"
#include "groupby_hash.cuh"

// Benchmark driver: runs the CPU reference groupby and the GPU hash
// groupby on the same random data, prints both timings, and validates
// the GPU result against the CPU result.
// Usage: prog [num_rows [num_key_cols num_val_cols [num_distinct_keys]]]
int main(int argc, const char * argv[]) {
    using Time = std::chrono::high_resolution_clock;
    using fsec = std::chrono::duration<float>;

    // Defaults, overridable from the command line.
    int num_rows = 100000;
    int num_key_cols = 2;
    int num_val_cols = 3;
    int num_distinct_keys = 10;
    std::vector<std::string> args(argv, argv + argc);
    if (argc == 2) {
        num_rows = std::stoi(args.at(1));
    } else if (argc == 4) {
        num_rows = std::stoi(args.at(1));
        num_key_cols = std::stoi(args.at(2));
        num_val_cols = std::stoi(args.at(3));
    } else if (argc == 5) {
        num_rows = std::stoi(args.at(1));
        num_key_cols = std::stoi(args.at(2));
        num_val_cols = std::stoi(args.at(3));
        num_distinct_keys = std::stoi(args.at(4));
    } else if (argc != 1) {
        std::cerr << "Invalid arguments" << std::endl;
        exit(1);
    }

    // Setting up the CPU groupby (reference implementation).
    cpuGroupby slowGroupby(num_key_cols, num_val_cols, num_rows);
    slowGroupby.fillRand(num_distinct_keys, num_rows);

    // Pinned host copies of the inputs so the GPU path gets fast,
    // async-capable transfers. Fix: CUDA API return codes are now
    // checked via gpuErrchk (from groupby_hash.cuh).
    int *original_key_columns;
    gpuErrchk(cudaMallocHost((void**)&original_key_columns, sizeof(int)*num_key_cols*num_rows));
    int *original_value_columns;
    gpuErrchk(cudaMallocHost((void**)&original_value_columns, sizeof(int)*num_val_cols*num_rows));
    std::copy(slowGroupby.key_columns, slowGroupby.key_columns + num_key_cols*num_rows, original_key_columns);
    std::copy(slowGroupby.value_columns, slowGroupby.value_columns + num_val_cols*num_rows, original_value_columns);

    // Time the CPU reference.
    auto start = Time::now();
    slowGroupby.groupby();
    auto end = Time::now();
    fsec cpu_duration = end - start;

    // Output buffers sized for the worst case (every row a distinct key).
    int *gpu_output_keys, *gpu_output_values;
    int gpu_output_rows = 0;
    gpu_output_keys = new int[slowGroupby.num_key_rows*slowGroupby.num_key_columns];
    gpu_output_values = new int[slowGroupby.num_value_rows*slowGroupby.num_value_columns];

    // Time the GPU implementation.
    start = Time::now();
    groupby_hash_GPU(original_key_columns, slowGroupby.num_key_columns,
                     slowGroupby.num_key_rows, original_value_columns,
                     slowGroupby.num_value_columns, slowGroupby.num_value_rows,
                     slowGroupby.ops, slowGroupby.num_ops,
                     gpu_output_keys, gpu_output_values, gpu_output_rows);
    end = Time::now();

    slowGroupby.printGPUResults(gpu_output_keys, gpu_output_values);

    fsec gpu_duration = end - start;

    std::cout << "CPU time: " << cpu_duration.count() << " s" << std::endl;
    std::cout << "GPU time: " << gpu_duration.count() << " s" << std::endl;

    slowGroupby.validGPUResult(gpu_output_keys, gpu_output_values, gpu_output_rows);

    // Fix: release the output buffers (leaked in the original) and
    // check the pinned-memory frees.
    delete[] gpu_output_values;
    delete[] gpu_output_keys;
    gpuErrchk(cudaFreeHost(original_value_columns));
    gpuErrchk(cudaFreeHost(original_key_columns));
    return 0;
}
9 changes: 8 additions & 1 deletion makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,25 @@ endif

LD_FLAGS = -lcudart -L/usr/local/cuda/lib64
EXE = groupby
EXE_HASH = groupby_hash
OBJ = main.o cpuGroupby.o groupby.o HashFunc.o
OBJ_HASH = main_hash.o cpuGroupby.o groupby_hash.o

default: $(EXE)

main.o: main.cu cpuGroupby.h groupby.cu
$(NVCC) -c -o $@ main.cu $(NVCC_FLAGS) $(CXX_FLAGS)

main_hash.o: main_hash.cu cpuGroupby.h groupby_hash.cuh
$(NVCC) -c -o $@ main_hash.cu $(NVCC_FLAGS) $(CXX_FLAGS)

HashFunc.o: HashFunc.cu HashFunc.cuh
$(NVCC) -c -o $@ HashFunc.cu $(NVCC_FLAGS)

groupby.o: groupby.cu
$(NVCC) -c -o $@ groupby.cu $(NVCC_FLAGS)

groupby_hash.o: groupby_hash.cu groupby_hash_templates.cu limits.cuh
groupby_hash.o: groupby_hash.cu groupby_hash_templates.cu limits.cuh groupby_hash.cuh
$(NVCC) -c -o $@ groupby_hash.cu $(NVCC_FLAGS) $(CXX_FLAGS)

cpuGroupby.o: cpuGroupby.cpp cpuGroupby.h
Expand All @@ -34,5 +39,7 @@ cpuGroupby.o: cpuGroupby.cpp cpuGroupby.h
$(EXE): $(OBJ)
$(NVCC) $(OBJ) -o $(EXE) $(LD_FLAGS) $(NVCC_FLAGS)

# Link the hash-based groupby benchmark.
# Fix: $(CXX_FLAG) was a typo for $(CXX_FLAGS) (the spelling used by
# every other rule in this makefile) and expanded to nothing, silently
# dropping the host-compiler flags at link time.
$(EXE_HASH): $(OBJ_HASH)
	$(NVCC) $(OBJ_HASH) -o $(EXE_HASH) $(LD_FLAGS) $(NVCC_FLAGS) $(CXX_FLAGS)
clean:
rm -rf *.o $(EXE)

0 comments on commit f517c36

Please sign in to comment.