
Commit

cpu and gpu sampling prediction
Tonglin Chen committed May 5, 2019
1 parent 53add47 commit e745f98
Showing 3 changed files with 145 additions and 6 deletions.
47 changes: 43 additions & 4 deletions groupby_hash.cu
@@ -3,6 +3,11 @@
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/device_ptr.h>
#include <random>
#include <iostream>
#include <cmath>
#include <curand.h>
#include <curand_kernel.h>

#include "cpuGroupby.h"
#include "groupby_hash.cuh"
@@ -45,14 +50,48 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key

gpuErrchk(cudaMalloc(&key_columns_d, sizeof(int)*num_key_columns*num_key_rows));
gpuErrchk(cudaMalloc(&value_columns_d, sizeof(int)*num_value_columns*num_value_rows));
gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(Tval)*HASH_TABLE_SIZE*num_ops));

// initialize values
// copy to target
gpuErrchk(cudaMemcpy(key_columns_d, key_columns_h, sizeof(int)*num_key_columns*num_key_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(value_columns_d, value_columns_h, sizeof(int)*num_value_columns*num_value_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(ops_c, ops, sizeof(reductionType) * num_ops));

// sample keys to predict the required hash table length
#ifdef CPU_SAMPLE
unsigned int predictedLength = predictTableLength_CPU<int>(key_columns_h,
num_key_rows,
num_key_columns);
std::cout << "Predicted Hash Table Length:" << predictedLength << std::endl;
#elif defined(GPU_SAMPLE)
unsigned int* count = NULL;
curandState* state = NULL;
gpuErrchk(cudaMallocManaged(&count, sizeof(unsigned int)*3));
gpuErrchk(cudaMalloc(&state, 1*BLOCKDIM*sizeof(curandState)));
unsigned int iterations = num_key_rows / BLOCKDIM / 100 + 1;
fillCURANDState<<<1, BLOCKDIM>>>(state, gen());
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
predictTableLength_GPU<int><<<1, BLOCKDIM>>>(key_columns_d,
num_key_rows,
num_key_columns,
iterations,
count,
state);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());

unsigned int countTotal = count[0] + count[1] + count[2];
float delta = std::sqrt((float)countTotal*((float)countTotal*9 - (float)count[1]*12));
unsigned int predictedLength = 2.6 * ((3*countTotal + delta) / (2*count[1]));
std::cout << "Predicted Hash Table Length:" << predictedLength << std::endl;
#endif


// Allocate hash table
gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(Tval)*HASH_TABLE_SIZE*num_ops));

initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

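
For reference, one way to read the GPU formula above (an interpretation, not text from the commit): let T = count[0] + count[1] + count[2] be the number of sampled triples and p the probability that two randomly chosen rows carry the same key. If an "exactly two equal" event occurs with probability about 3p(1 - p) per triple, then setting 3Tp(1 - p) = count[1] and taking the smaller root of the quadratic gives

\[
\frac{1}{p} = \frac{3T + \sqrt{9T^{2} - 12\,T\,C_{1}}}{2C_{1}} = \frac{3T + \delta}{2C_{1}},
\qquad C_{1} = \mathrm{count}[1],\quad \delta = \sqrt{T\,(9T - 12C_{1})},
\]

which is exactly the delta and ratio computed in the GPU branch. For roughly equal-sized groups, 1/p estimates the number of distinct key groups, and the 2.6 factor adds the same headroom used by the CPU estimator (which samples one pair per trial, so there 1/p is simply numTotal/numEqual). Note that, unlike the CPU path, the GPU branch has no fallback when count[1] is 0, so a sample with no matching pairs divides by zero.
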
96 changes: 94 additions & 2 deletions groupby_hash_templates.cu
@@ -4,7 +4,7 @@

// assume column major here
template <typename T> __host__ __device__
bool keyEqualCM(T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
bool keyEqualCM(const T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
{
for (size_t i=0; i < num_key_columns; ++i) {
if (key_columns[i*num_key_rows+idx1] != key_columns[i*num_key_rows+idx2])
@@ -15,7 +15,7 @@ bool keyEqualCM(T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, s

// assume row major here
template <typename T> __host__ __device__
bool keyEqualRM(T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
bool keyEqualRM(const T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
{
for (size_t i=0; i < num_key_columns; ++i) {
if (key_columns[i+num_key_rows*idx1] != key_columns[i+num_key_rows*idx2])
@@ -359,3 +359,95 @@ struct is_pos
return x >= 0;
}
};

extern std::mt19937 gen;

template <typename T> __host__
unsigned int predictTableLength_CPU(const T* key_columns,
size_t num_key_rows,
size_t num_key_columns)
{
// Predict hash table length based on a two-state transition model of sampled key pairs
unsigned int numEqual = 0;
unsigned int numTotal = 0;

std::uniform_int_distribution<unsigned int> keyRange(0, num_key_rows-1);

// sample at most 1% of the key rows
for (size_t i=0; i < num_key_rows/100; ++i) {
size_t idx1 = keyRange(gen);
size_t idx2 = keyRange(gen);
bool result = keyEqualCM(key_columns, idx1, idx2, num_key_rows, num_key_columns);
if (result)
++numEqual;
++numTotal;
if (numEqual == 10)
break;
}
if (numEqual < 2) // too few matching samples; fall back to 1/4 of the row count
return num_key_rows / 4;
return (unsigned int) (2.6f * ((float)numTotal / numEqual));
}
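
A minimal host-side sketch of how the CPU estimator might be exercised (hypothetical, not part of the commit): it assumes the template above is visible in the same translation unit and supplies the std::mt19937 gen that the extern declaration expects; the synthetic keys are laid out column-major, as keyEqualCM requires.

#include <iostream>
#include <random>
#include <vector>

std::mt19937 gen(42);  // in the repo this definition lives elsewhere; fixed seed here for reproducibility

int main() {
    const size_t num_rows = 100000, num_cols = 2;
    std::vector<int> keys(num_rows * num_cols);
    // 5 distinct key groups, stored column-major: element (row r, column c) is keys[c*num_rows + r]
    for (size_t c = 0; c < num_cols; ++c)
        for (size_t r = 0; r < num_rows; ++r)
            keys[c * num_rows + r] = static_cast<int>(r % 5);
    unsigned int len = predictTableLength_CPU<int>(keys.data(), num_rows, num_cols);
    std::cout << "predicted length: " << len << std::endl;
    return 0;
}

Since the sampler stops after 10 matches, the printed estimate should land around 2.6 * numTotal / numEqual, i.e. roughly 2.6 times the true group count of 5, with noticeable run-to-run variance.
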

__global__
void fillCURANDState(curandState* state, unsigned long seed)
{
size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
curand_init(seed, idx, 0, &state[idx]);
}

template <typename T> __global__
void predictTableLength_GPU(const T* key_columns,
size_t num_key_rows,
size_t num_key_columns,
size_t iterations,
unsigned int* count,
curandState* state)
{
#ifdef DEBUG
constexpr unsigned int BLOCKSIZE = 512;
#else
constexpr unsigned int BLOCKSIZE = 1024;
#endif

__shared__ unsigned int count_shared[3*BLOCKSIZE];
size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
// initialize shared memory
for (size_t i = 0; i < 3; ++i) {
count_shared[i*BLOCKSIZE + threadIdx.x] = 0;
}
for (size_t i = 0; i < iterations; ++i) {
unsigned int test_idx[3];
bool result[3];
// curand_uniform returns values in (0, 1], so wrap the 1.0 edge case back into range
for (size_t j = 0; j < 3; ++j)
test_idx[j] = (unsigned int)(curand_uniform(&state[idx]) * num_key_rows) % num_key_rows;
// compare keys
for (size_t j = 0; j < 3; ++j)
result[j] = keyEqualCM(key_columns, test_idx[j],
test_idx[(j+1)%3], num_key_rows,
num_key_columns);
if (result[0] && result[1]) // by transitivity these two checks suffice: all three keys are equal
count_shared[threadIdx.x] += 1;
else if (result[0] || result[1] || result[2]) // exactly one pair matches: exactly two keys are equal
count_shared[BLOCKSIZE + threadIdx.x] += 1;
else // all three keys are different
count_shared[BLOCKSIZE*2 + threadIdx.x] += 1;
}
__syncthreads();
// reduction
for (size_t stride = (blockDim.x >> 1);
stride >= 1;
stride >>= 1) {
if (threadIdx.x < stride) {
for (size_t i = 0; i < 3; ++i) {
count_shared[threadIdx.x + BLOCKSIZE*i]
+= count_shared[threadIdx.x + BLOCKSIZE*i + stride];
}
}
__syncthreads();
}
if (threadIdx.x == 0)
for (size_t i = 0; i < 3; ++i) {
count[i] = count_shared[BLOCKSIZE*i];
}
}
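
One coupling worth noting about the kernel above (an observation, not a change in the commit): the shared-memory bands are sized by the hard-coded BLOCKSIZE (512 under DEBUG, 1024 otherwise), the tree reduction assumes blockDim.x is a power of two no larger than BLOCKSIZE, and the final count[i] = count_shared[BLOCKSIZE*i] writes assume a single block, matching the <<<1, BLOCKDIM>>> launch and the 3-element managed count buffer in groupby_hash.cu. BLOCKDIM therefore has to agree with the DEBUG/non-DEBUG BLOCKSIZE choice. A hypothetical compile-time guard, assuming BLOCKDIM is a plain compile-time constant from groupby_hash.cuh:

#ifdef DEBUG
static_assert(BLOCKDIM <= 512, "predictTableLength_GPU assumes blockDim.x <= BLOCKSIZE (512 in DEBUG builds)");
#else
static_assert(BLOCKDIM <= 1024, "predictTableLength_GPU assumes blockDim.x <= BLOCKSIZE (1024)");
#endif
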
8 changes: 8 additions & 0 deletions makefile
@@ -22,6 +22,14 @@ ifdef TESLA
CXX_FLAGS += -DTESLA
endif

ifdef GPU_SAMPLE
CXX_FLAGS += -DGPU_SAMPLE
endif

ifdef CPU_SAMPLE
CXX_FLAGS += -DCPU_SAMPLE
endif

LD_FLAGS = -lcudart -L/usr/local/cuda/lib64
EXE = groupby
EXE_HASH = groupby_hash
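
Build-time usage (an inference from the ifdef hooks above, not shown in the diff): make treats variables assigned on the command line as defined for ifdef, so invoking e.g. make CPU_SAMPLE=1 or make GPU_SAMPLE=1 should add the matching -D define to CXX_FLAGS. Only one of the two should be set per build, since groupby_hash.cu chooses between them with #ifdef CPU_SAMPLE / #elif defined(GPU_SAMPLE).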
