Commit dfd6d57

Merge pull request #4 from FalsitaFine/master
-relaunch
ceettt authored May 5, 2019
2 parents bc3189a + ce1b4b1 commit dfd6d57
Showing 2 changed files with 67 additions and 25 deletions.
81 changes: 59 additions & 22 deletions groupby_hash.cu
@@ -41,6 +41,12 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key

using Tval = int; // replace int with actual variable type if needed;

// set restart flags
int hashsize_mutiplier = 1;
int* overflow_flag = NULL;
cudaMallocManaged(&overflow_flag, sizeof(int));
overflow_flag[0] = 0; // no overflow yet

// variableAllocating
int* key_columns_d = NULL;
int* value_columns_d = NULL;
@@ -86,21 +92,46 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key
std::cout << "Predicted Hash Table Length:" << predictedLength << std::endl;
#endif


// Allocate hash table
gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(Tval)*HASH_TABLE_SIZE*num_ops));

initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

// fill hash table
#ifndef PRIVATIZATION
fillTable<int, int><<<GRIDDIM, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE, num_ops);

do {
overflow_flag[0] = 0;

gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE*hashsize_mutiplier));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE*hashsize_mutiplier));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(int)*HASH_TABLE_SIZE*num_ops*hashsize_mutiplier));

gpuErrchk(cudaMemcpy(key_columns_d, key_columns_h, sizeof(int)*num_key_columns*num_key_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(value_columns_d, value_columns_h, sizeof(int)*num_value_columns*num_value_rows, cudaMemcpyHostToDevice));
initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE*hashsize_mutiplier, num_ops);
gpuErrchk(cudaDeviceSynchronize());


fillTable<int, int><<<GRIDDIM, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE*hashsize_mutiplier, num_ops, overflow_flag);
gpuErrchk(cudaDeviceSynchronize());
printf("The overflow_flag is: %d\n", overflow_flag[0]);
printf("Current hash size is: %d\n", hashsize_mutiplier*HASH_TABLE_SIZE);
if (overflow_flag[0] == 1) {
hashsize_mutiplier *= 3;
cudaFree(hash_key_idx_d);
cudaFree(hash_count_d);
cudaFree(hash_results_d);
hash_key_idx_d = NULL;
hash_count_d = NULL;
hash_results_d = NULL;

}

} while(overflow_flag[0] == 1);

//printf("The overflow_flag is: %d\n", overflow_flag[0]);

#else
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
@@ -126,36 +157,42 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key
num_key_columns, value_columns_d,
num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d,
hash_results_d, HASH_TABLE_SIZE,
hash_results_d, HASH_TABLE_SIZE*hashsize_mutiplier,
s_len_table, num_ops);
#endif
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());

#endif
//shrink the hash table to the output array
//create an array of indices for the hash table
int *seq, *hashTable_idxs;
cudaMalloc((void**)&seq, HASH_TABLE_SIZE*sizeof(int)); //for hash index sequence
cudaMalloc((void**)&hashTable_idxs, HASH_TABLE_SIZE*sizeof(int)); //for key indexes without -1
int hash_table_size_fixed = HASH_TABLE_SIZE*hashsize_mutiplier;
cudaMalloc((void**)&seq, HASH_TABLE_SIZE*hashsize_mutiplier*sizeof(int)); //for hash index sequence
cudaMalloc((void**)&hashTable_idxs, HASH_TABLE_SIZE*hashsize_mutiplier*sizeof(int)); //for key indexes without -1
thrust::device_ptr<int> hash_d_seq = thrust::device_pointer_cast(seq); //for hash index sequence
thrust::device_ptr<int> hashTable_idxs_d = thrust::device_pointer_cast(hashTable_idxs); //for key indexes without -1
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE); //fill hash index seq
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + hash_table_size_fixed); //fill hash index seq


//copy hash index of keys, removing -1's which signify unused slots
// copy_if(policy, index seq start, index seq end, hash keys for comparison, result containing idx to non -1's, comparator)
auto newEnd = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE, hash_key_idx_d, hashTable_idxs_d, is_pos());
auto newEnd = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + hash_table_size_fixed, hash_key_idx_d, hashTable_idxs_d, is_pos());

num_output_rows = newEnd - hashTable_idxs_d;
printf("%d output rows!\n", num_output_rows);
printf("%d hash length!\n", HASH_TABLE_SIZE*hashsize_mutiplier);

int* output_key_columns_d = NULL;
cudaMalloc(&output_key_columns_d, sizeof(int)*num_key_columns*num_output_rows);
copyUnique<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_key_idx_d,key_columns_d, output_key_columns_d, num_output_rows, num_key_columns, num_key_rows);

//gpuErrchk(cudaDeviceSynchronize());

int* output_value_columns_d = NULL;
cudaMalloc(&output_value_columns_d, sizeof(int)*num_value_columns*num_output_rows);
copyValues<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, num_ops, HASH_TABLE_SIZE);
gpuErrchk(cudaMalloc(&output_value_columns_d, sizeof(int)*num_value_columns*num_output_rows));
copyValues<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, num_ops, hash_table_size_fixed);

printf("%d,%d\n",BLOCKDIM,GRIDDIM);
printf("waiting for Sync\n");

gpuErrchk(cudaDeviceSynchronize());

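Taken together, the groupby_hash.cu changes wrap table allocation and the fillTable launch in a do/while loop: a managed overflow_flag is cleared, the kernel runs, and if it reports overflow the hash table is freed, hashsize_mutiplier is tripled, and the whole step relaunches. Below is a minimal, self-contained sketch of that pattern, not the committed code; fill_table, CHECK, base_len, and multiplier are hypothetical stand-ins for fillTable, gpuErrchk, HASH_TABLE_SIZE, and hashsize_mutiplier.

// Hedged sketch of the grow-and-relaunch pattern (hypothetical names, not
// the committed code): clear a managed overflow flag, run the fill kernel,
// and if it reports overflow free the table, triple the size, and relaunch.
#include <cstdio>
#include <cuda_runtime.h>

#define CHECK(call)                                                       \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess)                                              \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err_));          \
  } while (0)

// Stand-in for fillTable: flags overflow whenever the table is "too small".
__global__ void fill_table(int* table, size_t table_len, int* overflow_flag) {
  if (threadIdx.x == 0 && blockIdx.x == 0 && table_len < 1024)
    overflow_flag[0] = 1;
}

int main() {
  const size_t base_len = 256;  // stand-in for HASH_TABLE_SIZE
  size_t multiplier = 1;        // stand-in for hashsize_mutiplier
  int* overflow_flag = nullptr;
  CHECK(cudaMallocManaged(&overflow_flag, sizeof(int)));

  int* table = nullptr;
  do {
    overflow_flag[0] = 0;                                  // assume no overflow
    CHECK(cudaMalloc(&table, sizeof(int) * base_len * multiplier));
    fill_table<<<64, 256>>>(table, base_len * multiplier, overflow_flag);
    CHECK(cudaDeviceSynchronize());                        // flag is valid after sync
    if (overflow_flag[0] == 1) {                           // overflow: grow and retry
      multiplier *= 3;
      CHECK(cudaFree(table));
      table = nullptr;
    }
  } while (overflow_flag[0] == 1);

  std::printf("final table length: %zu\n", base_len * multiplier);
  CHECK(cudaFree(table));
  CHECK(cudaFree(overflow_flag));
  return 0;
}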
11 changes: 8 additions & 3 deletions groupby_hash_templates.cu
@@ -80,8 +80,11 @@ void fillTable(Tkey* key_columns,
int* hash_count,
Tval* hash_results,
size_t len_hash_table,
size_t num_ops)
size_t num_ops,
int* overflow_flag
)
{

size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
size_t offset = gridDim.x * blockDim.x;
for (size_t i = idx; i < num_key_rows; i += offset) {
@@ -103,7 +106,7 @@ void fillTable(Tkey* key_columns,
if (!keyEqualCM<Tkey>(key_columns, (size_t)old, i, num_key_rows, num_key_cols)) {
// collision
curPos = (curPos + 1) % len_hash_table; // linear probing
if (++collisionCount == len_hash_table)
if (++collisionCount >= len_hash_table * 0.75)
break; // give up once probing has covered 75% of the hash table without finding a slot
continue;
}
@@ -114,6 +117,8 @@ void fillTable(Tkey* key_columns,
}
if (!isInserted) {
// hash table overflowed: set the flag so the host can relaunch with a larger table
overflow_flag[0] = 1;
//printf("Overflow happened at %d \n", len_hash_table);
}
}
}
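For reference, the probing change above behaves roughly like the sketch below, a hedged approximation rather than the committed kernel: insert_key, slots, and insert_keys are hypothetical names, and the real fillTable hashes multi-column keys and accumulates per-op results. The point illustrated is that linear probing now gives up after scanning about 75% of the table and reports overflow through the shared flag.

// Hedged sketch (not the committed kernel): linear probing that gives up
// once 75% of the table has been scanned, then reports overflow to the host.
__device__ bool insert_key(int* slots, size_t len, int key, int* overflow_flag) {
  size_t pos = (size_t)key % len;
  for (size_t probes = 0; probes < (size_t)(len * 0.75); ++probes) {
    int old = atomicCAS(&slots[pos], -1, key);  // claim an empty (-1) slot
    if (old == -1 || old == key)                // inserted, or key already present
      return true;
    pos = (pos + 1) % len;                      // collision: probe the next slot
  }
  overflow_flag[0] = 1;                         // table is effectively full
  return false;
}

__global__ void insert_keys(int* slots, size_t len, const int* keys, size_t n,
                            int* overflow_flag) {
  size_t i = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (i < n)
    insert_key(slots, len, keys[i], overflow_flag);
}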
@@ -136,7 +141,6 @@ void fillTable_privatization(Tkey* key_columns,
size_t offset = gridDim.x * blockDim.x;
__shared__ unsigned int filled_hash_table_shared;
extern __shared__ char hash_table_shared[];

int* s_hash_key_idx = (int*)hash_table_shared;
int* s_hash_count = (int*)&(hash_table_shared[len_shared_hash_table*sizeof(int)]);
size_t s_offset = (2*len_shared_hash_table*sizeof(int) + sizeof(Tval) - 1) / sizeof(Tval);
@@ -325,6 +329,7 @@ void copyValues(
)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
//printf("%d\n",idx);
while (idx < num_output_rows){
for (size_t i = 0; i < num_ops; ++i) {
size_t val_idx = i * len_hash_table + hashTable_idxs_d[idx];
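The copyValues indexing above assumes an op-major layout for hash_results_d: results for op i occupy one contiguous block of len_hash_table entries. That is why the call site in groupby_hash.cu now passes the grown length (hash_table_size_fixed) instead of HASH_TABLE_SIZE. A small, hedged illustration of that layout (result_index is a hypothetical helper, not part of the repository):

#include <cstddef>

// Results for op i live in hash_results[i*len_hash_table .. (i+1)*len_hash_table).
// len_hash_table must be the grown table length when hashsize_mutiplier > 1,
// otherwise every op after the first would read from the wrong block.
inline std::size_t result_index(std::size_t op, std::size_t slot,
                                std::size_t len_hash_table) {
  return op * len_hash_table + slot;
}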
