Skip to content

Commit

Permalink
pass compile, fixed the use of constant memory, sigsegv on cudaFree???
Browse files Browse the repository at this point in the history
  • Loading branch information
Tonglin Chen committed Apr 22, 2019
1 parent 7376542 commit f517c36
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 44 deletions.
53 changes: 27 additions & 26 deletions groupby_hash.cu
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/device_ptr.h>

#include "cpuGroupby.h"
#include "groupby_hash_templates.cu"

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#include "groupby_hash.cuh"

// is there dynamic size constant memory?
__constant__ reductionType ops_d[512];
__constant__ reductionType ops_c[512];


#include "groupby_hash_templates.cu"


void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key_rows,
const int* value_columns_h, int num_value_columns, int num_value_rows,
reductionType* ops, int num_ops, int* output_keys, int* output_values, int &num_output_rows)
{
constexpr unsigned int BLOCKDIM = 1024;
constexpr unsigned int HASH_TABLE_SIZE = 1003;
constexpr unsigned int GRIDDIM = 40; // 40 as GTX 1080 only have 20 SMs and it can schedule 2048 threads
// change to 56*2 = 112 if testing on Tesla P100

// variableAllocating
int* key_columns_d = NULL;
int* value_columns_d = NULL;
Expand All @@ -39,40 +39,40 @@ void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key
// initialize values
gpuErrchk(cudaMemcpy(key_columns_d, key_columns_h, sizeof(int)*num_key_columns*num_key_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(value_columns_d, value_columns_h, sizeof(int)*num_value_columns*num_value_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(ops_d, ops, sizeof(reductionType) * num_ops));
initializeVariable<int><<<50, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, ops_d, num_ops);
gpuErrchk(cudaMemcpyToSymbol(ops_c, ops, sizeof(reductionType) * num_ops));
initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

// fill hash table
fillTable<int, int><<<50, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE, ops_d, num_ops);
fillTable<int, int><<<GRIDDIM, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

// shrink the hash table to output array
//shrink the hash table to output array
//Create array of idices for hash table
int * seq, hashTable_idxs;
int *seq, *hashTable_idxs;
cudaMalloc((void**)&seq, HASH_TABLE_SIZE*sizeof(int)); //for hash index sequence
cudaMalloc((void**)&key_idxs, HASH_TABLE_SIZE*sizeof(int)); //for key indexs without -1
cudaMalloc((void**)&hashTable_idxs, HASH_TABLE_SIZE*sizeof(int)); //for key indexs without -1
thrust::device_ptr<int> hash_d_seq = thrust::device_pointer_cast(seq); //for hash index sequence
thrust::device_ptr<int> hashTable_idxs_d = thrust::device_pointer_cast(hashTable_idxs); //for key indexs without -1
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE; //fill hash index seq
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE); //fill hash index seq


//copy hash index of keys, removing -1's which signify unused slots
// copy_if(policy, index seq start, index seq end, hash keys for comparison, result containing idx to non -1's, comparator)
int *new_end = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE, hash_key_idx_d, hashTable_idxs_d, is_not_neg_1());
auto newEnd = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE, hash_key_idx_d, hashTable_idxs_d, is_not_neg_1());

num_output_rows = newEnd - hashTable_idxs_d;

int* output_key_columns_d = NULL;
cudaMalloc(&output_key_columns_d, sizeof(int)*num_key_columns*num_output_rows);
copyUnique<int><<<50,BLOCKDIM>>>(hashTable_idxs_d, hash_key_idx_d,key_columns_d, output_key_columns_d, num_output_rows, num_key_columns, num_key_rows);
copyUnique<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_key_idx_d,key_columns_d, output_key_columns_d, num_output_rows, num_key_columns, num_key_rows);

int* output_value_columns_d = NULL;
cudaMalloc(&output_value_columns_d, sizeof(int)*num_value_columns*num_output_rows);
copyValues<int><<<50,BLOCKDIM>>>(hashTable_idxs_d, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, ops_d, num_ops, HASH_TABLE_SIZE);
copyValues<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, num_ops, HASH_TABLE_SIZE);

gpuErrchk(cudaDeviceSynchronize());

Expand All @@ -91,6 +91,7 @@ void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key
cudaFree(hash_results_d);
cudaFree(output_key_columns_d);
cudaFree(output_value_columns_d);
cudaFree(key_idxs);
cudaFree(seq);
cudaFree(hashTable_idxs);

}
19 changes: 19 additions & 0 deletions groupby_hash.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#ifndef GROUPBY_HASH_CUH
#define GROUPBY_HASH_CUH

#include <cstdio>   // fprintf, stderr
#include <cstdlib>  // exit

// NOTE(review): this header also relies on cudaError_t / cudaSuccess /
// cudaGetErrorString (from <cuda_runtime.h>) and on reductionType
// (declared in cpuGroupby.h); includers must pull those in first, as
// groupby_hash.cu and main_hash.cu currently do — confirm before
// adding the includes here.

// Wraps a CUDA runtime call: on failure, report the error string with
// the call site's file/line and abort.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints a diagnostic for a failed CUDA API call to stderr.
// When abort is true (the default) the process exits with the CUDA
// error code as its exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// GPU hash-based groupby.
//   key_columns_h / value_columns_h: column-major host input matrices.
//   ops / num_ops: per-value-column reduction operators.
//   output_keys / output_values: caller-allocated host output buffers.
//   num_output_rows: set to the number of distinct key rows produced.
void groupby_hash_GPU(const int* key_columns_h, int num_key_columns, int num_key_rows,
                      const int* value_columns_h, int num_value_columns, int num_value_rows,
                      reductionType* ops, int num_ops, int* output_keys, int* output_values, int &num_output_rows);


#endif
30 changes: 13 additions & 17 deletions groupby_hash_templates.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ size_t HashKey(size_t idx) {
template <typename Tval> __device__
void updateEntry(Tval* value_columns,
size_t num_val_rows,
reductionType* ops,
size_t num_ops,
size_t idx,
size_t hashPos,
Expand All @@ -47,7 +46,7 @@ void updateEntry(Tval* value_columns,
for (size_t i = 0; i < num_ops; ++i) {
Tval value = value_columns[i * num_val_rows + idx];
size_t val_idx = i * len_hash_table + hashPos;
switch(ops[i]) {
switch(ops_c[i]) {
case rmin:
atomicMin(&(hash_results[val_idx]), value);
break;
Expand Down Expand Up @@ -77,7 +76,6 @@ void fillTable(Tkey* key_columns,
int* hash_count,
Tval* hash_results,
size_t len_hash_table,
reductionType* ops,
size_t num_ops)
{
size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down Expand Up @@ -106,7 +104,7 @@ void fillTable(Tkey* key_columns,
}
// now it is safe to update the entry
isInserted = true;
updateEntry<Tval>(value_columns, num_val_rows, ops, num_ops, i, curPos, hash_results, &(hash_count[curPos]), len_hash_table);
updateEntry<Tval>(value_columns, num_val_rows, num_ops, i, curPos, hash_results, &(hash_count[curPos]), len_hash_table);
}
if (!isInserted) {
// Do sth in the case of overflowing hash table
Expand All @@ -119,7 +117,6 @@ void initializeVariable(int* hash_key_idx,
int* hash_count,
Tval* hash_results,
size_t len_hash_table,
reductionType* ops,
size_t num_ops)
{
// each thread responsible for one entry (with thread coarsening)
Expand All @@ -130,9 +127,9 @@ void initializeVariable(int* hash_key_idx,
hash_count[i] = 0;
for (size_t j = 0; j < num_ops; ++j) {
// replace following with specialized limit template in the future
if (ops[i] == rmin) {
if (ops_c[j] == rmin) {
hash_results[j * len_hash_table + i] = cuda_custom::limits<Tval>::max();
} else if (ops[i] == rmax) {
} else if (ops_c[j] == rmax) {
hash_results[j * len_hash_table + i] = cuda_custom::limits<Tval>::lowest();
} else {
hash_results[j * len_hash_table + i] = 0;
Expand All @@ -149,12 +146,12 @@ void copyUnique(
Tval* key_columns_d,
Tval* output_key_columns_d,
int num_output_rows,
int num_key_columns.
int num_key_columns,
int num_key_rows)
{
idx = threadIdx.x + blockIdx.x * blockDim.x;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < num_output_rows){
for (int i = 0; i < num_key_columns, i++){//each column of key matrix
for (int i = 0; i < num_key_columns; i++){//each column of key matrix
output_key_columns_d[idx+num_output_rows*i] = key_columns_d[hash_key_idx_d[hashTable_idxs_d[idx]]+num_key_rows*i];//copy original key entry to output
}
idx += gridDim.x*blockDim.x;//increment idx by thread space
Expand All @@ -169,18 +166,17 @@ void copyValues(
Tval* value_columns_d,
Tval* output_value_columns_d,
int num_output_rows,
int num_value_columns.
int num_value_columns,
int num_value_rows,
reductionType* ops,
size_t num_ops,
size_t len_hash_table,
size_t len_hash_table
)
{
idx = threadIdx.x + blockIdx.x * blockDim.x;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < num_output_rows){
for (size_t i = 0; i < num_ops; ++i) {
size_t val_idx = i * len_hash_table + hashTable_idxs_d[idx];
switch(ops[i]) {
switch(ops_c[i]) {
case rmin:
output_value_columns_d[idx+num_output_rows*i] = hash_results_d[val_idx];//copy result to output
break;
Expand Down Expand Up @@ -208,6 +204,6 @@ struct is_not_neg_1
__host__ __device__
bool operator()(const int x)
{
return x != -1s;
return x != -1;
}
};
};
81 changes: 81 additions & 0 deletions main_hash.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#include <iostream>
#include <algorithm>
#include <chrono>
#include <vector>
#include <string>
#include "cpuGroupby.h"
#include "groupby_hash.cuh"

// Benchmark driver: runs the CPU reference groupby and the GPU hash
// groupby on the same random data, prints both timings, and validates
// the GPU result against the CPU result.
// Usage: prog [num_rows [num_key_cols num_val_cols [num_distinct_keys]]]
int main(int argc, const char * argv[]) {
    using Time = std::chrono::high_resolution_clock;
    using fsec = std::chrono::duration<float>;

    // Defaults, overridable from the command line.
    int num_rows = 100000;
    int num_key_cols = 2;
    int num_val_cols = 3;
    int num_distinct_keys = 10;
    std::vector<std::string> args(argv, argv + argc);
    if (argc == 2) {
        num_rows = std::stoi(args.at(1));
    } else if (argc == 4) {
        num_rows = std::stoi(args.at(1));
        num_key_cols = std::stoi(args.at(2));
        num_val_cols = std::stoi(args.at(3));
    } else if (argc == 5) {
        num_rows = std::stoi(args.at(1));
        num_key_cols = std::stoi(args.at(2));
        num_val_cols = std::stoi(args.at(3));
        num_distinct_keys = std::stoi(args.at(4));
    } else if (argc != 1) {
        std::cerr << "Invalid arguments" << std::endl;
        exit(1);
    }

    // Setting up the CPU groupby (reference implementation).
    cpuGroupby slowGroupby(num_key_cols, num_val_cols, num_rows);
    slowGroupby.fillRand(num_distinct_keys, num_rows);

    // Pinned host copies of the inputs so the GPU path gets fast,
    // async-capable transfers. Fix: CUDA API return codes are now
    // checked via gpuErrchk (from groupby_hash.cuh).
    int *original_key_columns;
    gpuErrchk(cudaMallocHost((void**)&original_key_columns, sizeof(int)*num_key_cols*num_rows));
    int *original_value_columns;
    gpuErrchk(cudaMallocHost((void**)&original_value_columns, sizeof(int)*num_val_cols*num_rows));
    std::copy(slowGroupby.key_columns, slowGroupby.key_columns + num_key_cols*num_rows, original_key_columns);
    std::copy(slowGroupby.value_columns, slowGroupby.value_columns + num_val_cols*num_rows, original_value_columns);

    // Time the CPU reference.
    auto start = Time::now();
    slowGroupby.groupby();
    auto end = Time::now();
    fsec cpu_duration = end - start;

    // Output buffers sized for the worst case (every row a distinct key).
    int *gpu_output_keys, *gpu_output_values;
    int gpu_output_rows = 0;
    gpu_output_keys = new int[slowGroupby.num_key_rows*slowGroupby.num_key_columns];
    gpu_output_values = new int[slowGroupby.num_value_rows*slowGroupby.num_value_columns];

    // Time the GPU implementation.
    start = Time::now();
    groupby_hash_GPU(original_key_columns, slowGroupby.num_key_columns,
                     slowGroupby.num_key_rows, original_value_columns,
                     slowGroupby.num_value_columns, slowGroupby.num_value_rows,
                     slowGroupby.ops, slowGroupby.num_ops,
                     gpu_output_keys, gpu_output_values, gpu_output_rows);
    end = Time::now();

    slowGroupby.printGPUResults(gpu_output_keys, gpu_output_values);

    fsec gpu_duration = end - start;

    std::cout << "CPU time: " << cpu_duration.count() << " s" << std::endl;
    std::cout << "GPU time: " << gpu_duration.count() << " s" << std::endl;

    slowGroupby.validGPUResult(gpu_output_keys, gpu_output_values, gpu_output_rows);

    // Fix: release the output buffers (leaked in the original) and
    // check the pinned-memory frees.
    delete[] gpu_output_values;
    delete[] gpu_output_keys;
    gpuErrchk(cudaFreeHost(original_value_columns));
    gpuErrchk(cudaFreeHost(original_key_columns));
    return 0;
}
9 changes: 8 additions & 1 deletion makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,25 @@ endif

LD_FLAGS = -lcudart -L/usr/local/cuda/lib64
EXE = groupby
EXE_HASH = groupby_hash
OBJ = main.o cpuGroupby.o groupby.o HashFunc.o
OBJ_HASH = main_hash.o cpuGroupby.o groupby_hash.o

default: $(EXE)

main.o: main.cu cpuGroupby.h groupby.cu
$(NVCC) -c -o $@ main.cu $(NVCC_FLAGS) $(CXX_FLAGS)

main_hash.o: main_hash.cu cpuGroupby.h groupby_hash.cuh
$(NVCC) -c -o $@ main_hash.cu $(NVCC_FLAGS) $(CXX_FLAGS)

HashFunc.o: HashFunc.cu HashFunc.cuh
$(NVCC) -c -o $@ HashFunc.cu $(NVCC_FLAGS)

groupby.o: groupby.cu
$(NVCC) -c -o $@ groupby.cu $(NVCC_FLAGS)

groupby_hash.o: groupby_hash.cu groupby_hash_templates.cu limits.cuh
groupby_hash.o: groupby_hash.cu groupby_hash_templates.cu limits.cuh groupby_hash.cuh
$(NVCC) -c -o $@ groupby_hash.cu $(NVCC_FLAGS) $(CXX_FLAGS)

cpuGroupby.o: cpuGroupby.cpp cpuGroupby.h
Expand All @@ -34,5 +39,7 @@ cpuGroupby.o: cpuGroupby.cpp cpuGroupby.h
$(EXE): $(OBJ)
$(NVCC) $(OBJ) -o $(EXE) $(LD_FLAGS) $(NVCC_FLAGS)

# Link the hash-based groupby benchmark.
# Fix: $(CXX_FLAG) was a typo for $(CXX_FLAGS) (the spelling used by
# every other rule in this makefile) and expanded to nothing, silently
# dropping the host-compiler flags at link time.
$(EXE_HASH): $(OBJ_HASH)
	$(NVCC) $(OBJ_HASH) -o $(EXE_HASH) $(LD_FLAGS) $(NVCC_FLAGS) $(CXX_FLAGS)
clean:
rm -rf *.o $(EXE)

0 comments on commit f517c36

Please sign in to comment.