Commit dfd6d57

Merge pull request #4 from FalsitaFine/master
-relaunch
ceettt authored May 5, 2019
2 parents bc3189a + ce1b4b1 commit dfd6d57
Showing 2 changed files with 67 additions and 25 deletions.
81 changes: 59 additions & 22 deletions groupby_hash.cu
@@ -41,6 +41,12 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key

using Tval = int; // replace int with actual variable type if needed;

// set restart flags
int hashsize_mutiplier = 1;
int* overflow_flag = NULL;
cudaMallocManaged(&overflow_flag, sizeof(int));
overflow_flag[0] = 0; // no overflow yet

// variableAllocating
int* key_columns_d = NULL;
int* value_columns_d = NULL;
@@ -86,21 +92,46 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key
std::cout << "Predicted Hash Table Length:" << predictedLength << std::endl;
#endif


// Allocate hash table
gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(Tval)*HASH_TABLE_SIZE*num_ops));

initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

// fill hash table
#ifndef PRIVATIZATION
fillTable<int, int><<<GRIDDIM, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE, num_ops);

do {
overflow_flag[0] = 0;

gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE*hashsize_mutiplier));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE*hashsize_mutiplier));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(int)*HASH_TABLE_SIZE*num_ops*hashsize_mutiplier));

gpuErrchk(cudaMemcpy(key_columns_d, key_columns_h, sizeof(int)*num_key_columns*num_key_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(value_columns_d, value_columns_h, sizeof(int)*num_value_columns*num_value_rows, cudaMemcpyHostToDevice));
initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE*hashsize_mutiplier, num_ops);
gpuErrchk(cudaDeviceSynchronize());


fillTable<int, int><<<GRIDDIM, BLOCKDIM>>>(key_columns_d, num_key_rows, num_key_columns,
value_columns_d, num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d, hash_results_d,
HASH_TABLE_SIZE*hashsize_mutiplier, num_ops, overflow_flag);
gpuErrchk(cudaDeviceSynchronize());
printf("The overflow_flag is: %d\n", overflow_flag[0]);
printf("Current hash size is: %d\n", hashsize_mutiplier*HASH_TABLE_SIZE);
if (overflow_flag[0] == 1) {
hashsize_mutiplier *= 3;
cudaFree(hash_key_idx_d);
cudaFree(hash_count_d);
cudaFree(hash_results_d);
hash_key_idx_d = NULL;
hash_count_d = NULL;
hash_results_d = NULL;

}

} while(overflow_flag[0] == 1);

//printf("The overflow_flag is: %d\n", overflow_flag[0]);

#else
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
@@ -126,36 +157,42 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key
num_key_columns, value_columns_d,
num_value_rows, num_value_columns,
hash_key_idx_d, hash_count_d,
hash_results_d, HASH_TABLE_SIZE,
hash_results_d, HASH_TABLE_SIZE*hashsize_mutiplier,
s_len_table, num_ops);
#endif
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());

#endif
//shrink the hash table to the output array
//create an array of indices for the hash table
int *seq, *hashTable_idxs;
cudaMalloc((void**)&seq, HASH_TABLE_SIZE*sizeof(int)); //for hash index sequence
cudaMalloc((void**)&hashTable_idxs, HASH_TABLE_SIZE*sizeof(int)); //for key indexes without -1
int hash_table_size_fixed = HASH_TABLE_SIZE*hashsize_mutiplier;
cudaMalloc((void**)&seq, HASH_TABLE_SIZE*hashsize_mutiplier*sizeof(int)); //for hash index sequence
cudaMalloc((void**)&hashTable_idxs, HASH_TABLE_SIZE*hashsize_mutiplier*sizeof(int)); //for key indexes without -1
thrust::device_ptr<int> hash_d_seq = thrust::device_pointer_cast(seq); //for hash index sequence
thrust::device_ptr<int> hashTable_idxs_d = thrust::device_pointer_cast(hashTable_idxs); //for key indexes without -1
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE); //fill hash index seq
thrust::sequence(thrust::device, hash_d_seq, hash_d_seq + hash_table_size_fixed); //fill hash index seq


//copy hash index of keys, removing -1's which signify unused slots
// copy_if(policy, index seq start, index seq end, hash keys for comparison, result containing idx to non -1's, comparator)
auto newEnd = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + HASH_TABLE_SIZE, hash_key_idx_d, hashTable_idxs_d, is_pos());
auto newEnd = thrust::copy_if(thrust::device, hash_d_seq, hash_d_seq + hash_table_size_fixed, hash_key_idx_d, hashTable_idxs_d, is_pos());

num_output_rows = newEnd - hashTable_idxs_d;
printf("%d output rows!\n", num_output_rows);
printf("%d hash length!\n", HASH_TABLE_SIZE*hashsize_mutiplier);

int* output_key_columns_d = NULL;
cudaMalloc(&output_key_columns_d, sizeof(int)*num_key_columns*num_output_rows);
copyUnique<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_key_idx_d,key_columns_d, output_key_columns_d, num_output_rows, num_key_columns, num_key_rows);

//gpuErrchk(cudaDeviceSynchronize());

int* output_value_columns_d = NULL;
cudaMalloc(&output_value_columns_d, sizeof(int)*num_value_columns*num_output_rows);
copyValues<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, num_ops, HASH_TABLE_SIZE);
gpuErrchk(cudaMalloc(&output_value_columns_d, sizeof(int)*num_value_columns*num_output_rows));
copyValues<int><<<GRIDDIM,BLOCKDIM>>>(hashTable_idxs, hash_results_d,hash_count_d, value_columns_d, output_value_columns_d, num_output_rows, num_value_columns, num_value_rows, num_ops, hash_table_size_fixed);

printf("%d,%d\n",BLOCKDIM,GRIDDIM);
printf("waiting for Sync\n");

gpuErrchk(cudaDeviceSynchronize());

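Taken together, the groupby_hash.cu changes wrap table allocation and the fillTable launch in a do/while loop: a managed overflow_flag is cleared, the kernel runs, and if it reports overflow the hash table is freed, hashsize_mutiplier is tripled, and the whole step relaunches. Below is a minimal, self-contained sketch of that pattern, not the committed code; fill_table, CHECK, base_len, and multiplier are hypothetical stand-ins for fillTable, gpuErrchk, HASH_TABLE_SIZE, and hashsize_mutiplier.

// Hedged sketch of the grow-and-relaunch pattern (hypothetical names, not
// the committed code): clear a managed overflow flag, run the fill kernel,
// and if it reports overflow free the table, triple the size, and relaunch.
#include <cstdio>
#include <cuda_runtime.h>

#define CHECK(call)                                                       \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess)                                              \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err_));          \
  } while (0)

// Stand-in for fillTable: flags overflow whenever the table is "too small".
__global__ void fill_table(int* table, size_t table_len, int* overflow_flag) {
  if (threadIdx.x == 0 && blockIdx.x == 0 && table_len < 1024)
    overflow_flag[0] = 1;
}

int main() {
  const size_t base_len = 256;  // stand-in for HASH_TABLE_SIZE
  size_t multiplier = 1;        // stand-in for hashsize_mutiplier
  int* overflow_flag = nullptr;
  CHECK(cudaMallocManaged(&overflow_flag, sizeof(int)));

  int* table = nullptr;
  do {
    overflow_flag[0] = 0;                                  // assume no overflow
    CHECK(cudaMalloc(&table, sizeof(int) * base_len * multiplier));
    fill_table<<<64, 256>>>(table, base_len * multiplier, overflow_flag);
    CHECK(cudaDeviceSynchronize());                        // flag is valid after sync
    if (overflow_flag[0] == 1) {                           // overflow: grow and retry
      multiplier *= 3;
      CHECK(cudaFree(table));
      table = nullptr;
    }
  } while (overflow_flag[0] == 1);

  std::printf("final table length: %zu\n", base_len * multiplier);
  CHECK(cudaFree(table));
  CHECK(cudaFree(overflow_flag));
  return 0;
}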
11 changes: 8 additions & 3 deletions groupby_hash_templates.cu
@@ -80,8 +80,11 @@ void fillTable(Tkey* key_columns,
int* hash_count,
Tval* hash_results,
size_t len_hash_table,
size_t num_ops)
size_t num_ops,
int* overflow_flag
)
{

size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
size_t offset = gridDim.x * blockDim.x;
for (size_t i = idx; i < num_key_rows; i += offset) {
@@ -103,7 +106,7 @@ void fillTable(Tkey* key_columns,
if (!keyEqualCM<Tkey>(key_columns, (size_t)old, i, num_key_rows, num_key_cols)) {
// collision
curPos = (curPos + 1) % len_hash_table; // linear probing
if (++collisionCount == len_hash_table)
if (++collisionCount >= len_hash_table * 0.75)
break; // give up once probing has covered 75% of the hash table without finding a slot
continue;
}
@@ -114,6 +117,8 @@ void fillTable(Tkey* key_columns,
}
if (!isInserted) {
// hash table overflowed: set the flag so the host can relaunch with a larger table
overflow_flag[0] = 1;
//printf("Overflow happened at %d \n", len_hash_table);
}
}
}
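For reference, the probing change above behaves roughly like the sketch below, a hedged approximation rather than the committed kernel: insert_key, slots, and insert_keys are hypothetical names, and the real fillTable hashes multi-column keys and accumulates per-op results. The point illustrated is that linear probing now gives up after scanning about 75% of the table and reports overflow through the shared flag.

// Hedged sketch (not the committed kernel): linear probing that gives up
// once 75% of the table has been scanned, then reports overflow to the host.
__device__ bool insert_key(int* slots, size_t len, int key, int* overflow_flag) {
  size_t pos = (size_t)key % len;
  for (size_t probes = 0; probes < (size_t)(len * 0.75); ++probes) {
    int old = atomicCAS(&slots[pos], -1, key);  // claim an empty (-1) slot
    if (old == -1 || old == key)                // inserted, or key already present
      return true;
    pos = (pos + 1) % len;                      // collision: probe the next slot
  }
  overflow_flag[0] = 1;                         // table is effectively full
  return false;
}

__global__ void insert_keys(int* slots, size_t len, const int* keys, size_t n,
                            int* overflow_flag) {
  size_t i = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (i < n)
    insert_key(slots, len, keys[i], overflow_flag);
}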
@@ -136,7 +141,6 @@ void fillTable_privatization(Tkey* key_columns,
size_t offset = gridDim.x * blockDim.x;
__shared__ unsigned int filled_hash_table_shared;
extern __shared__ char hash_table_shared[];

int* s_hash_key_idx = (int*)hash_table_shared;
int* s_hash_count = (int*)&(hash_table_shared[len_shared_hash_table*sizeof(int)]);
size_t s_offset = (2*len_shared_hash_table*sizeof(int) + sizeof(Tval) - 1) / sizeof(Tval);
@@ -325,6 +329,7 @@ void copyValues(
)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
//printf("%d\n",idx);
while (idx < num_output_rows){
for (size_t i = 0; i < num_ops; ++i) {
size_t val_idx = i * len_hash_table + hashTable_idxs_d[idx];
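The copyValues indexing above assumes an op-major layout for hash_results_d: results for op i occupy one contiguous block of len_hash_table entries. That is why the call site in groupby_hash.cu now passes the grown length (hash_table_size_fixed) instead of HASH_TABLE_SIZE. A small, hedged illustration of that layout (result_index is a hypothetical helper, not part of the repository):

#include <cstddef>

// Results for op i live in hash_results[i*len_hash_table .. (i+1)*len_hash_table).
// len_hash_table must be the grown table length when hashsize_mutiplier > 1,
// otherwise every op after the first would read from the wrong block.
inline std::size_t result_index(std::size_t op, std::size_t slot,
                                std::size_t len_hash_table) {
  return op * len_hash_table + slot;
}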
