
Commit

cpu and gpu sampling prediction
Tonglin Chen committed May 5, 2019
1 parent 53add47 commit e745f98
Showing 3 changed files with 145 additions and 6 deletions.
47 changes: 43 additions & 4 deletions groupby_hash.cu
@@ -3,6 +3,11 @@
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/device_ptr.h>
#include <random>
#include <iostream>
#include <cmath>
#include <curand.h>
#include <curand_kernel.h>

#include "cpuGroupby.h"
#include "groupby_hash.cuh"
@@ -45,14 +50,48 @@ void groupby_hash_GPU(const int hash_size, const int* key_columns_h, int num_key

gpuErrchk(cudaMalloc(&key_columns_d, sizeof(int)*num_key_columns*num_key_rows));
gpuErrchk(cudaMalloc(&value_columns_d, sizeof(int)*num_value_columns*num_value_rows));
gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(Tval)*HASH_TABLE_SIZE*num_ops));

// initialize values
// copy to target
gpuErrchk(cudaMemcpy(key_columns_d, key_columns_h, sizeof(int)*num_key_columns*num_key_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(value_columns_d, value_columns_h, sizeof(int)*num_value_columns*num_value_rows, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyToSymbol(ops_c, ops, sizeof(reductionType) * num_ops));

// sample keys to predict the required hash table length
#ifdef CPU_SAMPLE
unsigned int predictedLength = predictTableLength_CPU<int>(key_columns_h,
num_key_rows,
num_key_columns);
std::cout << "Predicted Hash Table Length:" << predictedLength << std::endl;
#elif defined(GPU_SAMPLE)
unsigned int* count = NULL;
curandState* state = NULL;
gpuErrchk(cudaMallocManaged(&count, sizeof(unsigned int)*3));
gpuErrchk(cudaMalloc(&state, 1*BLOCKDIM*sizeof(curandState)));
unsigned int iterations = num_key_rows / BLOCKDIM / 100 + 1;
fillCURANDState<<<1, BLOCKDIM>>>(state, gen());
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
predictTableLength_GPU<int><<<1, BLOCKDIM>>>(key_columns_d,
num_key_rows,
num_key_columns,
iterations,
count,
state);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());

unsigned int countTotal = count[0] + count[1] + count[2];
float delta = std::sqrt((float)countTotal*((float)countTotal*9 - (float)count[1]*12));
unsigned int predictedLength = 2.6 * ((3*countTotal + delta) / (2*count[1]));
std::cout << "Predicted Hash Table Length:" << predictedLength << std::endl;
#endif


// Allocate hash table
gpuErrchk(cudaMalloc(&hash_key_idx_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_count_d, sizeof(int)*HASH_TABLE_SIZE));
gpuErrchk(cudaMalloc(&hash_results_d, sizeof(Tval)*HASH_TABLE_SIZE*num_ops));

initializeVariable<int><<<GRIDDIM, BLOCKDIM>>>(hash_key_idx_d, hash_count_d, hash_results_d, HASH_TABLE_SIZE, num_ops);
gpuErrchk(cudaDeviceSynchronize());

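
For reference, one way to read the GPU formula above (an interpretation, not text from the commit): let T = count[0] + count[1] + count[2] be the number of sampled triples and p the probability that two randomly chosen rows carry the same key. If an "exactly two equal" event occurs with probability about 3p(1 - p) per triple, then setting 3Tp(1 - p) = count[1] and taking the smaller root of the quadratic gives

\[
\frac{1}{p} = \frac{3T + \sqrt{9T^{2} - 12\,T\,C_{1}}}{2C_{1}} = \frac{3T + \delta}{2C_{1}},
\qquad C_{1} = \mathrm{count}[1],\quad \delta = \sqrt{T\,(9T - 12C_{1})},
\]

which is exactly the delta and ratio computed in the GPU branch. For roughly equal-sized groups, 1/p estimates the number of distinct key groups, and the 2.6 factor adds the same headroom used by the CPU estimator (which samples one pair per trial, so there 1/p is simply numTotal/numEqual). Note that, unlike the CPU path, the GPU branch has no fallback when count[1] is 0, so a sample with no matching pairs divides by zero.
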
96 changes: 94 additions & 2 deletions groupby_hash_templates.cu
@@ -4,7 +4,7 @@

// assume column major here
template <typename T> __host__ __device__
bool keyEqualCM(T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
bool keyEqualCM(const T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
{
for (size_t i=0; i < num_key_columns; ++i) {
if (key_columns[i*num_key_rows+idx1] != key_columns[i*num_key_rows+idx2])
@@ -15,7 +15,7 @@ bool keyEqualCM(T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, s

// assume row major here
template <typename T> __host__ __device__
bool keyEqualRM(T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
bool keyEqualRM(const T* key_columns, size_t idx1, size_t idx2, size_t num_key_rows, size_t num_key_columns)
{
for (size_t i=0; i < num_key_columns; ++i) {
if (key_columns[i+num_key_rows*idx1] != key_columns[i+num_key_rows*idx2])
@@ -359,3 +359,95 @@ struct is_pos
return x >= 0;
}
};

extern std::mt19937 gen;

template <typename T> __host__
unsigned int predictTableLength_CPU(const T* key_columns,
size_t num_key_rows,
size_t num_key_columns)
{
// Predict hash table length based on a two-state transition model of sampled key pairs
unsigned int numEqual = 0;
unsigned int numTotal = 0;

std::uniform_int_distribution<unsigned int> keyRange(0, num_key_rows-1);

// sample at most 1% of the key rows
for (size_t i=0; i < num_key_rows/100; ++i) {
size_t idx1 = keyRange(gen);
size_t idx2 = keyRange(gen);
bool result = keyEqualCM(key_columns, idx1, idx2, num_key_rows, num_key_columns);
if (result)
++numEqual;
++numTotal;
if (numEqual == 10)
break;
}
if (numEqual < 2) // too few matching samples; fall back to 1/4 of the row count
return num_key_rows / 4;
return (unsigned int) (2.6f * ((float)numTotal / numEqual));
}
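
A minimal host-side sketch of how the CPU estimator might be exercised (hypothetical, not part of the commit): it assumes the template above is visible in the same translation unit and supplies the std::mt19937 gen that the extern declaration expects; the synthetic keys are laid out column-major, as keyEqualCM requires.

#include <iostream>
#include <random>
#include <vector>

std::mt19937 gen(42);  // in the repo this definition lives elsewhere; fixed seed here for reproducibility

int main() {
    const size_t num_rows = 100000, num_cols = 2;
    std::vector<int> keys(num_rows * num_cols);
    // 5 distinct key groups, stored column-major: element (row r, column c) is keys[c*num_rows + r]
    for (size_t c = 0; c < num_cols; ++c)
        for (size_t r = 0; r < num_rows; ++r)
            keys[c * num_rows + r] = static_cast<int>(r % 5);
    unsigned int len = predictTableLength_CPU<int>(keys.data(), num_rows, num_cols);
    std::cout << "predicted length: " << len << std::endl;
    return 0;
}

Since the sampler stops after 10 matches, the printed estimate should land around 2.6 * numTotal / numEqual, i.e. roughly 2.6 times the true group count of 5, with noticeable run-to-run variance.
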

__global__
void fillCURANDState(curandState* state, unsigned long seed)
{
size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
curand_init(seed, idx, 0, &state[idx]);
}

template <typename T> __global__
void predictTableLength_GPU(const T* key_columns,
size_t num_key_rows,
size_t num_key_columns,
size_t iterations,
unsigned int* count,
curandState* state)
{
#ifdef DEBUG
constexpr unsigned int BLOCKSIZE = 512;
#else
constexpr unsigned int BLOCKSIZE = 1024;
#endif

__shared__ unsigned int count_shared[3*BLOCKSIZE];
size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
// initialize shared memory
for (size_t i = 0; i < 3; ++i) {
count_shared[i*BLOCKSIZE + threadIdx.x] = 0;
}
for (size_t i = 0; i < iterations; ++i) {
unsigned int test_idx[3];
bool result[3];
// curand_uniform returns values in (0, 1], so wrap the 1.0 edge case back into range
for (size_t j = 0; j < 3; ++j)
test_idx[j] = (unsigned int)(curand_uniform(&state[idx]) * num_key_rows) % num_key_rows;
// compare keys
for (size_t j = 0; j < 3; ++j)
result[j] = keyEqualCM(key_columns, test_idx[j],
test_idx[(j+1)%3], num_key_rows,
num_key_columns);
if (result[0] && result[1]) // by transitivity these two checks suffice: all three keys are equal
count_shared[threadIdx.x] += 1;
else if (result[0] || result[1] || result[2]) // exactly one pair matches: exactly two keys are equal
count_shared[BLOCKSIZE + threadIdx.x] += 1;
else // all three keys are different
count_shared[BLOCKSIZE*2 + threadIdx.x] += 1;
}
__syncthreads();
// reduction
for (size_t stride = (blockDim.x >> 1);
stride >= 1;
stride >>= 1) {
if (threadIdx.x < stride) {
for (size_t i = 0; i < 3; ++i) {
count_shared[threadIdx.x + BLOCKSIZE*i]
+= count_shared[threadIdx.x + BLOCKSIZE*i + stride];
}
}
__syncthreads();
}
if (threadIdx.x == 0)
for (size_t i = 0; i < 3; ++i) {
count[i] = count_shared[BLOCKSIZE*i];
}
}
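
One coupling worth noting about the kernel above (an observation, not a change in the commit): the shared-memory bands are sized by the hard-coded BLOCKSIZE (512 under DEBUG, 1024 otherwise), the tree reduction assumes blockDim.x is a power of two no larger than BLOCKSIZE, and the final count[i] = count_shared[BLOCKSIZE*i] writes assume a single block, matching the <<<1, BLOCKDIM>>> launch and the 3-element managed count buffer in groupby_hash.cu. BLOCKDIM therefore has to agree with the DEBUG/non-DEBUG BLOCKSIZE choice. A hypothetical compile-time guard, assuming BLOCKDIM is a plain compile-time constant from groupby_hash.cuh:

#ifdef DEBUG
static_assert(BLOCKDIM <= 512, "predictTableLength_GPU assumes blockDim.x <= BLOCKSIZE (512 in DEBUG builds)");
#else
static_assert(BLOCKDIM <= 1024, "predictTableLength_GPU assumes blockDim.x <= BLOCKSIZE (1024)");
#endif
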
8 changes: 8 additions & 0 deletions makefile
@@ -22,6 +22,14 @@ ifdef TESLA
CXX_FLAGS += -DTESLA
endif

ifdef GPU_SAMPLE
CXX_FLAGS += -DGPU_SAMPLE
endif

ifdef CPU_SAMPLE
CXX_FLAGS += -DCPU_SAMPLE
endif

LD_FLAGS = -lcudart -L/usr/local/cuda/lib64
EXE = groupby
EXE_HASH = groupby_hash
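
Build-time usage (an inference from the ifdef hooks above, not shown in the diff): make treats variables assigned on the command line as defined for ifdef, so invoking e.g. make CPU_SAMPLE=1 or make GPU_SAMPLE=1 should add the matching -D define to CXX_FLAGS. Only one of the two should be set per build, since groupby_hash.cu chooses between them with #ifdef CPU_SAMPLE / #elif defined(GPU_SAMPLE).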
