diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..c195308 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -3,12 +3,37 @@ CUDA Character Recognition
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Eric Micallef
+  * https://www.linkedin.com/in/eric-micallef-99291714b/
+
+* Tested on: Windows 10, i5, Nvidia GTX 1660 (Personal)
 
-### (TODO: Your README)
+### Analysis
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Unfortunately, I will be taking a zero for this part of the project. I implemented some stuff on the GPU and CPU, but hardly enough to warrant any credit.
+Thursday morning I flew to Michigan to be in a wedding and came back Monday morning. Needless to say, I underestimated the time this would take.
+
+Instead of grading my project, please enjoy these nice photos of me from the wedding and a yak I found near my house in MI.
+
+* Me trying to make my neural network in one day ... (yes, that is me ... don't lock your keys in your trunk when you are in the middle of nowhere, Michigan)
+
+![](img/sawzall.gif)
+
+* Them feels when you think you know what back propagation is but then you don't ...
+
+![](img/n1.jpg)
+
+* Me pointing at my working neural network like ...
+
+![](img/nn.jpg)
+
+* A random yak in Michigan?
+
+![](img/yak.jpg#center)
+
+* Finally, it's over ...
+
+![](img/ss.jpg)
diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
index 7446175..bfd956f 100644
--- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
@@ -7,5 +7,5 @@ set(SOURCE_FILES
 
 cuda_add_library(character_recognition
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_70
     )
diff --git a/Project2-Character-Recognition/character_recognition/common.cu b/Project2-Character-Recognition/character_recognition/common.cu
index 2a754d4..4966397 100644
--- a/Project2-Character-Recognition/character_recognition/common.cu
+++ b/Project2-Character-Recognition/character_recognition/common.cu
@@ -13,3 +13,16 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
     exit(EXIT_FAILURE);
 }
+
+//void memory_debug_float(int elements, float* cuda_mem, float* cpu_mem)
+//{
+//    printf("elements %d\n ", elements);
+//    cudaMemcpy(cpu_mem, cuda_mem, elements * sizeof(float), cudaMemcpyDeviceToHost);
+//    checkCUDAErrorFn("debug failed!");
+//    printf("=============================\n");
+//    for (int i = 0; i < elements; i++)
+//    {
+//        printf("out[%d] %f \n", i, cpu_mem[i]);  // %f, not %d: cpu_mem holds floats
+//    }
+//    printf("=============================\n");
+//}
\ No newline at end of file
diff --git a/Project2-Character-Recognition/character_recognition/common.h b/Project2-Character-Recognition/character_recognition/common.h
index 6aede64..15f3b2c 100644
--- a/Project2-Character-Recognition/character_recognition/common.h
+++ b/Project2-Character-Recognition/character_recognition/common.h
@@ -17,6 +17,7 @@
  * Check for CUDA errors; print and exit if there was a problem.
  */
 void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+//void memory_debug_float(int elements, float* cuda_mem, float* cpu_mem);
 
 inline int ilog2(int x) {
     int lg = 0;
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..2b06850 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,6 +2,13 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "mlp.h"
+#include <cmath>   // for exp()/expf() in the transfer functions
+
+#define NUM_LAYERS 2
+#define INPUT_NODES 2
+#define NUM_WEIGHTS 2
+
+#define blockSize 128
 
 namespace CharacterRecognition {
     using Common::PerformanceTimer;
@@ -10,18 +17,257 @@ namespace CharacterRecognition {
         static PerformanceTimer timer;
         return timer;
     }
+
+    void memory_debug_float(int elements, float* cuda_mem, float* cpu_mem)
+    {
+        printf("elements %d\n ", elements);
+        cudaMemcpy(cpu_mem, cuda_mem, elements * sizeof(float), cudaMemcpyDeviceToHost);
+        checkCUDAErrorFn("debug failed!");
+        printf("=============================\n");
+        for (int i = 0; i < elements; i++)
+        {
+            printf("out[%d] %f \n", i, cpu_mem[i]);
+        }
+        printf("=============================\n");
+    }
+
     // TODO: __global__
+    // Feed-forward step: each of the n nodes scales its own input by its two weights.
+    __global__ void kernel_feed_forward(int n, float* dev_in, float* weights)
+    {
+        int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+        if (tid >= n)
+        {
+            return;
+        }
+
+        float data = dev_in[tid];
+
+        dev_in[tid] = (data * weights[tid]) + (data * weights[tid + n]);
+    }
+
+    // Sigmoid activation: f(x) = 1 / (1 + e^-x)
+    __global__ void kernel_activate(int n, float* dev_in)
+    {
+        int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+        if (tid >= n)
+        {
+            return;
+        }
+
+        float var = (dev_in[tid] * -1);
+        float e = expf(var);
+        dev_in[tid] = 1 / (1 + e);
+    }
+
+    float transfer_function(float in)
+    {
+        float var = (in * -1);
+        float e = exp(var);
+        return( 1 / (1 + e) );
+    }
+
+    //float transfer_derivative(float var)
+    //{
+    //    float var = (var * -1);
+    //    float e = exp(var);
+    //    return(1 - (1 / (1 + e)));
+    //}
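+
+    // A corrected sketch of the derivative (kept commented out, like the
+    // attempt above, since nothing calls it yet): the derivative of the
+    // logistic transfer function needed for back propagation is
+    // sigmoid(x) * (1 - sigmoid(x)), not the 1 - sigmoid(x) computed above.
+    //float transfer_derivative(float in)
+    //{
+    //    float s = 1 / (1 + exp(in * -1));
+    //    return (s * (1 - s));
+    //}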
+
+    void feed_forward(float* in, float* out, float* weights, int length)
+    {
+        // NOTE: as written, node i only ever sees its own input in[i];
+        // a fully-connected layer would accumulate in[j] * weight over j here.
+        for (int i = 0; i < length; i++)
+        {
+            float temp = in[i];
+            out[i] = 0;
+
+            for (int j = 0; j < INPUT_NODES; j++) // or layers?
+            {
+                float weight = *((weights + i * INPUT_NODES) + j);
+                out[i] += (temp * weight);
+                printf("in[%d] = %f * %f\n", i, temp, weight);
+            }
+
+            printf("feed[%d] = %f\n", i, out[i]);
+            out[i] = transfer_function(out[i]);
+            printf("activate: %f\n", out[i]);
+        }
+    }
+
+    //void activate_function(float* in,float* out, int length)
+    //{
+    //    for (int i = 0; i < length; i++)
+    //    {
+    //        float var = (in[i] * -1);
+    //        float e = exp(var);
+    //        out[i] = 1 / (1 + e);
+    //        printf("activate: %f\n", in[i]);
+    //    }
+    //}
+
+    //void update_weights(float* delta_weights, float* weights, float* gradient, float* data, int length)
+    //{
+    //    for (int i = 0; i < 6; i++)
+    //    {
+
+    //    }
+    //}
+
+    //float calculate_hidden_gradient( float* weight, float* gradient )
+    //{
+    //    weight[]
+    //    return (delta * transfer_derivative(out));
+    //}
+
+    //float calculate_gradient( float out, float target)
+    //{
+    //
+    //    return (out*target); // I think 1 / e + x
+    //}
+
+    //void back_propagate(float* data, int length,float* weights, float expected_value, float* gradient, float* delta_weight)
+    //{
+    //    float error = 0;
+    //    // loop through and compute the rms not including the bias node
+    //    for (int i = 0; i < length; i++)
+    //    {
+    //        // get the delta between what we predicted vs expected
+    //        float delta = data[i] - expected_value;
+
+    //        error = delta * delta;
+    //    }
+    //    error /= length;
+    //    error = std::sqrt(error);
+
+    //    // calculate gradient on input layer?
+    //    // only have one output layer node thing
+    //    for (int n = 4; n < 6; n++)
+    //    {
+    //        // calculate gradient of the layer
+    //        gradient[n] = calculate_gradient( weights[n], error );
+    //    }
+
+    //    //calculate gradient on hidden layer?
+
+    //    for (int n = 0; n < 4; n+=2)
+    //    {
+    //        // calculate hidden layer
+    //        calculate_hidden_gradient(weights[n], gradient[6-1-n], data[n]);
+    //    }
+
+    //    // update the weights
+    //    update_weights();
+
+    //}
+
     /**
      * Example of use case (follow how you did it in stream compaction)
      */
-    /*void scan(int n, int *odata, const int *idata) {
+    void train(int n, int *data, int expected) {
         timer().startGpuTimer();
-        // TODO
+
+        float weights[6];
+        float in_data[4] = { 0,0 };
+        float out_data[4];
+        float temp[4];
+        weights[0] = 10.1;
+        weights[1] = .9;
+        weights[2] = 20;
+        weights[3] = .87;
+        weights[4] = 41;
+        weights[5] = -54;
+        int wt_idx = 0;
+
+        float* dev_buff;
+        float* dev_wts;
+
+        float* host_buff = new float[4];
+
+        int rounded_depth = ilog2ceil(NUM_LAYERS);
+        int rounded_elements = 1 << rounded_depth;
+        printf("rounded elements %d\n ", rounded_elements);
+        dim3 fullBlocksPerGrid((rounded_elements + blockSize - 1) / blockSize);
+
+        cudaMalloc((void**)&dev_buff, 2 * sizeof(float));
+        checkCUDAErrorFn("malloc dev_buff failed!");
+        cudaMemcpy(dev_buff, in_data, 2 * sizeof(float), cudaMemcpyHostToDevice);
+        checkCUDAErrorFn("dev_in copy failed!");
+
+        /*cudaMalloc((void**)&dev_out, n * sizeof(int));
+        checkCUDAErrorFn("malloc dev_out in failed!");
+        cudaMalloc((void**)&dev_in, rounded_elements * sizeof(int));
+        checkCUDAErrorFn("malloc dev_in in failed!");*/
+
+        // is there a way to place this in memory at compile time?
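+        // (one way, as a sketch: a small fixed table like these six weights could
+        //  live in constant device memory, declared at file scope as
+        //  `__constant__ float c_weights[6];` and filled once with
+        //  `cudaMemcpyToSymbol(c_weights, weights, sizeof(weights));`;
+        //  the cudaMalloc/cudaMemcpy below keeps them in global memory instead)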
+        cudaMalloc((void**)&dev_wts, 6 * sizeof(float));
+        checkCUDAErrorFn("malloc dev_wts failed!");
+        cudaMemcpy(dev_wts, weights, 6 * sizeof(float), cudaMemcpyHostToDevice);
+        checkCUDAErrorFn("dev_weights copy failed!");
+
+        // input nodes
+        // each weight has an impact on the other nodes
+        for (int i = NUM_LAYERS; i > 0; i--)
+        {
+            //feed_forward(&in_data[0], &temp[0], (float*)&weights[wt_idx], i, INPUT_NODES);
+            //activate_function(&temp[0], &out_data[0], i );
+            kernel_feed_forward<<< fullBlocksPerGrid, blockSize >>>(i, dev_buff, &dev_wts[wt_idx]);
+            checkCUDAErrorFn("feed forward failed!");
+            memory_debug_float(i, &dev_buff[0], &host_buff[0]);
+            kernel_activate<<< fullBlocksPerGrid, blockSize >>>(i, dev_buff);
+            checkCUDAErrorFn("activate failed!");
+            memory_debug_float(i, dev_buff, host_buff);
+            //feed_forward(&out_data[0], &temp[0], (float*)&weights[wt_idx][0], 1,2);
+            //activate_function(&temp[0], &out_data[0], 1);
+            //std::swap(in_data, out_data);
+            wt_idx += 4; // length of array? NUM_NODES * INPUT_NODES?
+        }
+
+        //error = out_data[0]
         timer().endGpuTimer();
+
+        // release what was allocated above
+        cudaFree(dev_buff);
+        cudaFree(dev_wts);
+        delete[] host_buff;
     }
-    */
+
+    void train_cpu(int n, float *data, float expected)
+    {
+        float weights_layer1[4];
+        float weights_layer2[2];
+
+        weights_layer1[0] = 10.1;
+        weights_layer1[1] = .9;
+        weights_layer1[2] = 20;
+        weights_layer1[3] = .87;
+        weights_layer2[0] = 41;   // was weights_layer2[4]: out of bounds for a 2-element array
+        weights_layer2[1] = -54;  // was weights_layer2[5]
+        int wt_idx = 0;
+
+        float* hidden_layer = new float[n];
+        float* out_put_layer = new float[n];
+        float* temp[NUM_LAYERS + 1]; // pointers to the per-layer buffers
+        temp[0] = data;
+        temp[1] = hidden_layer;
+        temp[2] = out_put_layer;
+
+        float* temp_weights[NUM_LAYERS]; // pointers to the per-layer weights
+        temp_weights[0] = weights_layer1; // was temp[0]/temp[1], which overwrote the data pointers
+        temp_weights[1] = weights_layer2;
+
+        for (int i = 0; i < NUM_LAYERS; i++)
+        {
+            feed_forward(temp[i], temp[i+1], temp_weights[i], NUM_LAYERS - i);
+        }
+
+        delete[] hidden_layer;   // array form: these came from new[]
+        delete[] out_put_layer;
+        //back_propagate(in_data,NUM_LAYERS+1,&weights[0],exp,gradient
+
+    }
 
     // TODO: implement required elements for MLP sections 1 and 2 here
 }
diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h
index 2096228..900a552 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.h
+++ b/Project2-Character-Recognition/character_recognition/mlp.h
@@ -6,4 +6,6 @@ namespace CharacterRecognition {
     Common::PerformanceTimer& timer();
 
     // TODO: implement required elements for MLP sections 1 and 2 here
+    void train(int n, int *data, int expected); // declaration now matches the definition in mlp.cu
+    void train_cpu(int n, float *data, const float expected);
 }
diff --git a/Project2-Character-Recognition/img/n1.jpg b/Project2-Character-Recognition/img/n1.jpg
new file mode 100644
index 0000000..1e1e8aa
Binary files /dev/null and b/Project2-Character-Recognition/img/n1.jpg differ
diff --git a/Project2-Character-Recognition/img/nn.jpg b/Project2-Character-Recognition/img/nn.jpg
new file mode 100644
index 0000000..963c3ba
Binary files /dev/null and b/Project2-Character-Recognition/img/nn.jpg differ
diff --git a/Project2-Character-Recognition/img/sawzall.gif b/Project2-Character-Recognition/img/sawzall.gif
new file mode 100644
index 0000000..ebddf30
Binary files /dev/null and b/Project2-Character-Recognition/img/sawzall.gif differ
diff --git a/Project2-Character-Recognition/img/ss.jpg b/Project2-Character-Recognition/img/ss.jpg
new file mode 100644
index 0000000..7bfc8b3
Binary files /dev/null and b/Project2-Character-Recognition/img/ss.jpg differ
diff --git a/Project2-Character-Recognition/img/yak.jpg b/Project2-Character-Recognition/img/yak.jpg
new file mode 100644
index 0000000..d3bc4e2
Binary files /dev/null and b/Project2-Character-Recognition/img/yak.jpg differ
diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp
index 11dd534..8ebb095 100644
--- a/Project2-Character-Recognition/src/main.cpp
+++ b/Project2-Character-Recognition/src/main.cpp
@@ -10,6 +10,14 @@
 #include <character_recognition/mlp.h>
 #include <character_recognition/common.h>
 #include "testing_helpers.hpp"
+#include <fstream>   // std::fstream for the training file
+#include <string>
+#include <iostream>
+
+#include <stdio.h>   /* defines FILENAME_MAX */
+//#define WINDOWS    /* uncomment this line to use it for windows.*/
+#include <direct.h>
+#define GetCurrentDir _getcwd
 
 const int SIZE = 1 << 8; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
@@ -25,125 +33,161 @@ int main(int argc, char* argv[]) {
     printf("** SCAN TESTS **\n");
     printf("****************\n");
 
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
+    // genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+    // a[SIZE - 1] = 0;
+    // printArray(SIZE, a, true);
+
+    // open file,
+    // get each line,
+    // train on the data
+    std::fstream file("../trainingdata.txt");
+    std::string str;
+    if (file.is_open()) {
+        while (std::getline(file, str)) {
+            std::cout << str << std::endl;
+            float* data = new float[str.size() - 1];
+            int found = str.find(",");
+            int next = 0;
+            while (found != -1)
+            {
+                std::string temp(str.begin(), str.begin() + found);
+                std::cout << " found at: " << found << '\n';
+                data[next] = std::atoi(temp.c_str());
+                str.erase(str.begin(), str.begin() + found + 1);
+                found = str.find(",");
+                next++;
+            }
+
+            float expected = std::atoi(str.c_str());
+            std::cout << "expected " << expected << std::endl;
+            std::cout << "input " << data[0] << " " << data[1] << std::endl;
+            std::cout << "next " << next << std::endl;
+
+            CharacterRecognition::train_cpu(next, data, expected);
+
+            delete[] data; // array form: data came from new[]
+        }
+    }
+
+    //CharacterRecognition::train_cpu(SIZE, , b);
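+
+    // The GetCurrentDir macro defined above is never actually used; a sketch
+    // like the following (hypothetical, kept commented out) would print the
+    // working directory, which helps when the relative path
+    // "../trainingdata.txt" fails to open:
+    //char cwd[FILENAME_MAX];
+    //if (GetCurrentDir(cwd, FILENAME_MAX) != NULL)
+    //{
+    //    printf("cwd: %s\n", cwd);
+    //}
+
+    // // initialize b using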
StreamCompaction::CPU::scan you implement + // // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. + // // At first all cases passed because b && c are all zeroes. + // zeroArray(SIZE, b); + // printDesc("cpu scan, power-of-two"); + // StreamCompaction::CPU::scan(SIZE, b, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(SIZE, b, true); + + // zeroArray(SIZE, c); + // printDesc("cpu scan, non-power-of-two"); + // StreamCompaction::CPU::scan(NPOT, c, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(NPOT, b, true); + // printCmpResult(NPOT, b, c); + + // zeroArray(SIZE, c); + // printDesc("naive scan, power-of-two"); + // StreamCompaction::Naive::scan(SIZE, c, a); + // printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(SIZE, c, true); + // printCmpResult(SIZE, b, c); + + ///* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + //onesArray(SIZE, c); + //printDesc("1s array for finding bugs"); + //StreamCompaction::Naive::scan(SIZE, c, a); + //printArray(SIZE, c, true); */ + + // zeroArray(SIZE, c); + // printDesc("naive scan, non-power-of-two"); + // StreamCompaction::Naive::scan(NPOT, c, a); + // printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(SIZE, c, true); + // printCmpResult(NPOT, b, c); + + // zeroArray(SIZE, c); + // printDesc("work-efficient scan, power-of-two"); + // StreamCompaction::Efficient::scan(SIZE, c, a); + // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(SIZE, c, true); + // printCmpResult(SIZE, b, c); + + // zeroArray(SIZE, c); + // printDesc("work-efficient scan, non-power-of-two"); + // StreamCompaction::Efficient::scan(NPOT, c, a); + // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(NPOT, c, true); + // printCmpResult(NPOT, b, c); + + // zeroArray(SIZE, c); + // printDesc("thrust scan, power-of-two"); + // StreamCompaction::Thrust::scan(SIZE, c, a); + // printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(SIZE, c, true); + // printCmpResult(SIZE, b, c); + + // zeroArray(SIZE, c); + // printDesc("thrust scan, non-power-of-two"); + // StreamCompaction::Thrust::scan(NPOT, c, a); + // printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(NPOT, c, true); + // printCmpResult(NPOT, b, c); + + // printf("\n"); + // printf("*****************************\n"); + // printf("** STREAM COMPACTION TESTS **\n"); + // printf("*****************************\n"); + + // // Compaction tests + + // genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + // a[SIZE - 1] = 0; + // printArray(SIZE, a, true); + + // int count, expectedCount, expectedNPOT; + + // // initialize b using StreamCompaction::CPU::compactWithoutScan you implement + // // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. 
+ // zeroArray(SIZE, b); + // printDesc("cpu compact without scan, power-of-two"); + // count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // expectedCount = count; + // printArray(count, b, true); + // printCmpLenResult(count, expectedCount, b, b); + + // zeroArray(SIZE, c); + // printDesc("cpu compact without scan, non-power-of-two"); + // count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // expectedNPOT = count; + // printArray(count, c, true); + // printCmpLenResult(count, expectedNPOT, b, c); + + // zeroArray(SIZE, c); + // printDesc("cpu compact with scan"); + // count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + // printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + // printArray(count, c, true); + // printCmpLenResult(count, expectedCount, b, c); + + // zeroArray(SIZE, c); + // printDesc("work-efficient compact, power-of-two"); + // count = StreamCompaction::Efficient::compact(SIZE, c, a); + // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(count, c, true); + // printCmpLenResult(count, expectedCount, b, c); + + // zeroArray(SIZE, c); + // printDesc("work-efficient compact, non-power-of-two"); + // count = StreamCompaction::Efficient::compact(NPOT, c, a); + // printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + // //printArray(count, c, true); + // printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit delete[] a; diff --git a/Project2-Character-Recognition/trainingdata.txt b/Project2-Character-Recognition/trainingdata.txt new file mode 100644 index 0000000..e6b84b1 --- /dev/null +++ b/Project2-Character-Recognition/trainingdata.txt @@ -0,0 +1,408 @@ +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 
+1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 +0,0,0 +0,1,1 +1,0,1 +1,1,0 \ No newline at end of file diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md index 0e38ddb..22ec022 100644 --- a/Project2-Stream-Compaction/README.md +++ b/Project2-Stream-Compaction/README.md @@ -3,12 +3,51 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Eric Micallef + * https://www.linkedin.com/in/eric-micallef-99291714b/ + +* Tested on: Windows 10, i5, Nvidia GTX 1660 (Personal) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +## Overview +![](img/figure-39-2.PNG) + +The illustration above gives us a naive approach to performing the scan algorithm. In this algorithm the gpu does alot of repetitive memory reads and writes but gets to a solution. + +![](img/figure-39-4.PNG) + +The illustration above gives us the work efficient algorithm used. In this manner the gpu performs an upsweep and then a downsweep. +Although this algorithm does less computation than the naive implementation we will see later on that because the threads are not sequentially indexing we run into memory bottlenecks and the algorithm is not as efficient as it could be. + +## Blocksize Analysis + +From the data gathered we can see that the ideal block size is 128 for both Naive and Work Efficient Scans and compact algorithms. +a block size of 32 drastically increased the run time of the algorithms where as more blocks made the algorithms slower. My guess is that because so many threads that are not active at a given time 128 and 64 seem to be a happy balance for hiding some memory latency that occurs. + +![](img/differingblocksraw.PNG) + +![](img/differingblocks.PNG) + +## Performance Analysis + +When comparing the performances of the naive, work efficient and cpu implementations we can see that CPU scans are very quick. 
Once we get to about 32k elements the compact algorithm on the CPU slows down quite a bit, but performing scans on the CPU is significantly faster than on the GPU. This could be because the CPU scan gets nice sequential read access, whereas the work-efficient algorithm produces scattered memory access patterns, which causes some bottlenecks.
+
+![](img/smaller_graph.PNG)
+
+In the graph with the larger arrays we see a similar pattern forming, where the CPU compact is terribly slow in comparison to the GPU version. At these larger array sizes the GPU implementation starts to become slightly better than the CPU.
+
+![](img/larger_graph.PNG)
+
+## Final Analysis Thoughts
+
+In all implementations Thrust performed poorly. My initial guess is that perhaps Thrust always copies the data before it performs its computations, so we are seeing a false time for the actual algorithm.
+
+Although the naive implementation has more work to do, we see that its times are not significantly higher than those of the work-efficient algorithm. This could be because the work-efficient algorithm leaves many warps with just a few threads active, and its memory access patterns are random, allowing less latency hiding. It could also be that the work-efficient version has a bit more branch divergence than the naive one. A host-side reference sketch of both scan strategies appears at the end of this README.
+
+![](img/graph_raw.PNG)
+
+### Test Results
+
+![](img/moderate_test_result.PNG)
+
+![](img/large_test_result.PNG)
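+
+### Appendix: host-side reference for the two scans
+
+As a sanity check on the two strategies described above, here is a minimal host-side sketch of both scans. This is reference code, not the CUDA implementation; it assumes a power-of-two array, and is only meant for checking the kernels' expected output.
+
+```cpp
+#include <cstdio>
+#include <vector>
+
+// Naive strategy: iterative doubling with a ping-pong buffer (inclusive scan).
+void naive_scan(std::vector<int>& x) {
+    std::vector<int> tmp(x.size());
+    for (size_t d = 1; d < x.size(); d <<= 1) {
+        for (size_t i = 0; i < x.size(); ++i)
+            tmp[i] = (i >= d) ? x[i] + x[i - d] : x[i];
+        x.swap(tmp);
+    }
+}
+
+// Work-efficient strategy: in-place upsweep then downsweep (exclusive scan).
+void efficient_scan(std::vector<int>& x) {
+    int n = (int)x.size();                 // assumed to be a power of two
+    for (int d = 1; d < n; d <<= 1)        // upsweep: build partial sums
+        for (int k = 0; k < n; k += 2 * d)
+            x[k + 2 * d - 1] += x[k + d - 1];
+    x[n - 1] = 0;                          // zero the root
+    for (int d = n / 2; d >= 1; d >>= 1)   // downsweep: distribute the sums
+        for (int k = 0; k < n; k += 2 * d) {
+            int t = x[k + d - 1];
+            x[k + d - 1] = x[k + 2 * d - 1];
+            x[k + 2 * d - 1] += t;
+        }
+}
+
+int main() {
+    std::vector<int> x = { 3, 1, 7, 0, 4, 1, 6, 3 };
+    efficient_scan(x);                     // -> 0 3 4 11 11 15 16 22
+    for (int v : x) printf("%d ", v);
+    return 0;
+}
+```
+
+(One related note on the Thrust guess above: thrust.cu now builds its `device_vector`s before the timer starts, so the timed region should exclude host-device transfers.)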
diff --git a/Project2-Stream-Compaction/img/differingblocks.PNG b/Project2-Stream-Compaction/img/differingblocks.PNG
new file mode 100644
index 0000000..1370ded
Binary files /dev/null and b/Project2-Stream-Compaction/img/differingblocks.PNG differ
diff --git a/Project2-Stream-Compaction/img/differingblocksraw.PNG b/Project2-Stream-Compaction/img/differingblocksraw.PNG
new file mode 100644
index 0000000..6af81e6
Binary files /dev/null and b/Project2-Stream-Compaction/img/differingblocksraw.PNG differ
diff --git a/Project2-Stream-Compaction/img/graph_raw.PNG b/Project2-Stream-Compaction/img/graph_raw.PNG
new file mode 100644
index 0000000..05a741b
Binary files /dev/null and b/Project2-Stream-Compaction/img/graph_raw.PNG differ
diff --git a/Project2-Stream-Compaction/img/large_test_result.PNG b/Project2-Stream-Compaction/img/large_test_result.PNG
new file mode 100644
index 0000000..4592d20
Binary files /dev/null and b/Project2-Stream-Compaction/img/large_test_result.PNG differ
diff --git a/Project2-Stream-Compaction/img/larger_graph.PNG b/Project2-Stream-Compaction/img/larger_graph.PNG
new file mode 100644
index 0000000..27a9276
Binary files /dev/null and b/Project2-Stream-Compaction/img/larger_graph.PNG differ
diff --git a/Project2-Stream-Compaction/img/moderate_test_result.PNG b/Project2-Stream-Compaction/img/moderate_test_result.PNG
new file mode 100644
index 0000000..f86d6c3
Binary files /dev/null and b/Project2-Stream-Compaction/img/moderate_test_result.PNG differ
diff --git a/Project2-Stream-Compaction/img/smaller_graph.PNG b/Project2-Stream-Compaction/img/smaller_graph.PNG
new file mode 100644
index 0000000..bc32396
Binary files /dev/null and b/Project2-Stream-Compaction/img/smaller_graph.PNG differ
diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp
index d016553..c115591 100644
--- a/Project2-Stream-Compaction/src/main.cpp
+++ b/Project2-Stream-Compaction/src/main.cpp
@@ -13,16 +13,20 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 10; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
 int *c = new int[SIZE];
 
+int _a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+int *_b = new int[8];
+int *_c = new int[8];
+
 int main(int argc, char* argv[]) {
     // Scan tests
 
-    printf("\n");
+    printf("TEST SIZE %d NON POW TEST SIZE %d\n", SIZE, NPOT);
     printf("****************\n");
     printf("** SCAN TESTS **\n");
     printf("****************\n");
@@ -44,7 +48,7 @@ int main(int argc, char* argv[]) {
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
+    //printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -115,7 +119,7 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedCount = count;
-    printArray(count, b, true);
+    //printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
 
     zeroArray(SIZE, c);
@@ -123,14 +127,14 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedNPOT = count;
-    printArray(count, c, true);
+    //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
+    //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
index cdbef77..185a604 100644
--- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
+++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_75
     )
diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu
index 2ed6d63..417bc31 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.cu
+++ b/Project2-Stream-Compaction/stream_compaction/common.cu
@@ -22,8 +22,29 @@ namespace StreamCompaction {
          * Maps an array to an array of 0s and 1s for stream compaction. Elements
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
-        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata)
+        {
             // TODO
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            if (tid >= n)
+            {
+                return;
+            }
+
+            bools[tid] = (idata[tid] != 0) ? 1 : 0;
+
+//            if(idata[tid] != 0)
+//            {
+//                bools[tid] = 1;
+//            }
+//            else
+//            {
+//                bools[tid] = 0;
+//            }
+
+            return;
         }
 
         /**
          * Performs scatter on an array. That is, for each element in idata,
          * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
         */
         __global__ void kernScatter(int n, int *odata,
-                const int *idata, const int *bools, const int *indices) {
+                const int *idata, const int *bools, const int *indices)
+        {
             // TODO
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            if (tid >= n)
+            {
+                return;
+            }
+
+            if( bools[tid] == 1)
+            {
+                odata[indices[tid]] = idata[tid];
+            }
+            return;
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu
index a2d3e6c..ac28246 100644
--- a/Project2-Stream-Compaction/stream_compaction/cpu.cu
+++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu
@@ -11,7 +11,21 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-
+
+        // shared helper, since compactWithScan also needs a scan
+        static void __scan( int n, int* odata, const int *idata )
+        {
+            // from notes (exclusive scan):
+            //  in [3,1,4,7,0,4,1,6,3]
+            //  out[0,3,4,8,15,15,19,20,26]
+            //  itr1: odata[1] = idata[0] + odata[0] = 3
+            //  itr2: odata[2] = idata[1] + odata[1] = 4
+            odata[0] = 0; // identity element first; the output buffer may be uninitialized
+            for(int i = 1; i < n; i++)
+            {
+                odata[i] = idata[i-1] + odata[i-1];
+            }
+        }
         /**
         * CPU scan (prefix sum).
         * For performance analysis, this is supposed to be a simple for loop.
@@ -19,7 +33,9 @@ namespace StreamCompaction {
        */
        void scan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            __scan( n, odata, idata );
+
            timer().endCpuTimer();
        }
 
@@ -30,9 +46,22 @@ namespace StreamCompaction {
        */
        int compactWithoutScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            // from notes:
+            //  in [3,0,4,0,0,4,0,6,0]
+            //  out[3,4,4,6]
+            int writer = 0;
+            for(int i = 0; i < n; i++)
+            {
+                if( idata[i] != 0 )
+                {
+                    odata[writer] = idata[i];
+                    writer++;
+                }
+            }
            timer().endCpuTimer();
-            return -1;
+            return writer;
        }
 
        /**
@@ -40,11 +69,63 @@ namespace StreamCompaction {
        *
        * @returns the number of elements remaining after compaction.
        */
+        // in     [3,0,4,0,0,4,0,6,0]
+        // create bools [1,0,1,0,0,1,0,1,0]
+        // after (exclusive) scan [0,1,1,2,2,2,3,3,4] -- these are the output indexes for the kept elements
+        // then scatter:
+        // Result of scan is the index into the final array
+        // Only write an element if the temporary array has a 1
+        // Write index is given by the scan
+        // scatter out [3,4,4,6], return 4
        int compactWithScan(int n, int *odata, const int *idata) {
            timer().startCpuTimer();
            // TODO
+            // create the 1/0 buffer
+            int* buff = new int[n];
+            int i = 0;
+            int rval = 0;
+
+            // have   [3,0,4,0,0,4,0,6,0]
+            // create [1,0,1,0,0,1,0,1,0]
+            for(i = 0; i < n; i++)
+            {
+                if(idata[i] != 0)
+                {
+                    buff[i] = 1;
+                }
+                else
+                {
+                    buff[i] = 0;
+                }
+            }
+
+            // scan this buffer now to figure out the indexes
+            // have   [1,0,1,0,0,1,0,1,0]
+            // create [0,1,1,2,2,2,3,3,4] (exclusive scan)
+            int* scan_buff = new int[n];
+            __scan( n, scan_buff, buff ); // #el, out, in
+
+            // have scan [0,1,1,2,2,2,3,3,4]: index of where to place each kept element
+            // have input[3,0,4,0,0,4,0,6,0]
+            // create    [3,4,4,6]
+            for(i = 0; i < n; i++)
+            {
+                //printf("buff[%d] = %d ", i, buff[i]);
+                //printf("scan_buff[%d] = %d ", i, scan_buff[i]);
+                //printf("in[%d] = %d\n", i, idata[i]);
+                if(buff[i] == 1) // marked as data
+                {
+                    odata[scan_buff[i]] = idata[i];
+                    rval = scan_buff[i]; // index of the last element we kept
+                }
+            }
+
+            timer().endCpuTimer();
+
+            delete [] scan_buff;
+            delete [] buff;
+            return (rval + 1);
        }
    }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu
index 2db346e..73bfd8a 100644
--- a/Project2-Stream-Compaction/stream_compaction/efficient.cu
+++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "efficient.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +14,179 @@ namespace StreamCompaction {
             return timer;
         }
 
+        void memory_debug(int elements, int* cuda_mem, int* cpu_mem, const int* cpu_in)
+        {
+            cudaMemcpy(cpu_mem, cuda_mem, elements * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy out failed!");
+            printf("=============================\n");
+            for (int i = 0; i < elements; i++)
+            {
+                printf("out[%d] %d ", i, cpu_mem[i]);
+                printf("in[%d] %d\n", i, cpu_in[i]);
+            }
+            printf("=============================\n");
+        }
+
+        // NOTE: currently unused -- dev_scan below already produces an exclusive
+        // scan directly via the downsweep. (As written this mostly copies; a true
+        // shift would read idata[tid - 1].)
+        __global__ void kernel_inclusive_to_exclusive(int bufflength, int* idata, int* odata, int* inc_byte)
+        {
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            if (tid >= bufflength)
+                return;
+
+            if (tid == 0) {
+                odata[tid] = 0;
+                return;
+            }
+
+            if (tid == bufflength - 1) {
+                odata[tid] = idata[tid] + inc_byte[0];
+                return;
+            }
+            // shift one
+            odata[tid] = idata[tid];
+            return;
+        }
+
+        // according to the notes we need to pad with zeros to accommodate
+        // sizes that are not perfect powers of two.
+        __global__ void kernel_padd_0s(int* idata, int bufflength, int padded_length)
+        {
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            if (tid >= bufflength && tid < padded_length) // >=, so element [bufflength] is zeroed too
+                idata[tid] = 0;
+
+            return;
+        }
+
+        /*
+         * perform an upsweep
+         * from the notes that means:
+         *   for d = 0 to log2n - 1
+         *     for k = 0 to n-1 by 2^(d+1) in parallel
+         *       x[k + 2^(d+1) - 1] += x[k + 2^d - 1]   // so we need power and power plus one
+         */
+        __global__ void kernel_upsweep(int bufflength, int* data, int power, int power_plus1)
+        {
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            if (tid >= bufflength)
+                return;
+            // for depth = 0, in [0,1,2,3,4,5,6,7]:
+            // sums -> [0,(0+1),2,(2+3),4,(4+5),6,(6+7)]
+            // only threads whose tid is a multiple of 2^(d+1) compute something
+            if ( ( tid % power_plus1 ) == 0)
+            {
+                data[tid + power_plus1 - 1] += data[tid + power - 1];
+            }
+        }
+
+        /*
+         * perform a downsweep after an upsweep
+         * for [0,1,2,3,4,5,6,7], after the upsweep we have [0,1,2,6,4,9,6,28]
+         * from notes:
+         *   for d = log2n - 1 to 0
+         *     for all k = 0 to n-1 by 2^(d+1) in parallel
+         *       t = x[k + 2^d - 1]
+         *       x[k + 2^d - 1] = x[k + 2^(d+1) - 1]
+         *       x[k + 2^(d+1) - 1] += t
+         */
+        __global__ void kernel_downsweep(int bufflength, int* data, int power, int power_plus1)
+        {
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            if (tid >= bufflength)
+                return;
+            // first depth (the last depth of the upsweep), for [0,1,2,6,4,9,6,28]:
+            // set the last element to 0 -> [0,1,2,6,4,9,6,0]
+            // then swap-and-add: x[7] += x[3], x[3] = old x[7] -> [0,1,2,0,4,9,6,6]
+            if ((tid % power_plus1) == 0)
+            {
+                int old = data[tid + power - 1];
+                data[tid + power - 1] = data[tid + power_plus1 - 1];
+                data[tid + power_plus1 - 1] += old;
+            }
+        }
+
+        void dev_scan(int elements, int rounded_depth, int rounded_elements, int* dev_data, dim3 blocks)
+        {
+            int pow = 0;
+            int powplus1 = 0;
+            int byte[1] = { 0 };
+
+            for (int i = 0; i <= rounded_depth - 1; i++)
+            {
+                pow = (1 << i);
+                powplus1 = (1 << (i + 1));
+                //printf("i %d -> depth %d \n ", other_pow, pow2);
+                kernel_upsweep<<< blocks, blockSize >>>(elements, dev_data, pow, powplus1);
+                checkCUDAErrorFn("up sweep failed!");
+            }
+
+            //memory_debug(n, dev_data, odata, idata);
+
+            // write a single zero int to the LAST entry ... even if the size was
+            // rounded up and that last entry is padding
+            cudaMemcpy(&dev_data[rounded_elements - 1], &byte[0], sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("copy last element failed!");
+            //printf("write last element\n");
+            //memory_debug(n, dev_data, odata, idata);
+            //printf("starting downsweep\n");
+            for (int i = rounded_depth - 1; i >= 0; i--)
+            {
+                pow = (1 << (i));
+                powplus1 = (1 << (i + 1));
+                kernel_downsweep<<< blocks, blockSize >>>(rounded_elements, dev_data, pow, powplus1);
+                checkCUDAErrorFn("down sweep failed!");
+            }
+        }
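+
+        // A possible optimization (sketch only, not used here): the modulo test
+        // in the sweep kernels leaves most threads of every warp idle. Launching
+        // one thread per *active* node and striding the index avoids that, e.g.
+        // for the upsweep, launched with active = rounded_elements >> (d + 1)
+        // threads at depth d:
+        //
+        //__global__ void kernel_upsweep_compact(int active, int* data, int power_plus1)
+        //{
+        //    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+        //    if (tid >= active)
+        //        return;
+        //    int k = tid * power_plus1;  // contiguous tids -> strided tree nodes
+        //    data[k + power_plus1 - 1] += data[k + (power_plus1 >> 1) - 1];
+        //}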
 
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
+        void scan(int n, int *odata, const int *idata) {
+            int* dev_data;
+
+            // if we have 257 elements we need to account for element 257:
+            // ilog2ceil rounds the depth up, so the sweeps run over 512 entries
+            int rounded_depth = ilog2ceil(n);
+            int rounded_elements = 1 << rounded_depth;
+            dim3 fullBlocksPerGrid((rounded_elements + blockSize - 1) / blockSize);
+
+            // need a slightly bigger buffer, since with 257 elements the sweeps
+            // iterate up to 512
+            cudaMalloc((void**)&dev_data, rounded_elements * sizeof(int));
+            checkCUDAErrorFn("malloc temp in failed!");
+
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("scan idata copy failed!");
+
+            // pad if we need to
+            kernel_padd_0s<<< fullBlocksPerGrid, blockSize >>>(dev_data, n, rounded_elements);
+
+            timer().startGpuTimer();
+            // run the actual work-efficient algorithm
+            dev_scan(n, rounded_depth, rounded_elements, dev_data, fullBlocksPerGrid);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy out failed!");
+
+            cudaFree(dev_data);
+            checkCUDAErrorFn("free input failed!");
+        }
 
        /**
        * Performs stream compaction on idata, storing the result into odata.
@@ -31,10 +198,84 @@ namespace StreamCompaction {
        * @returns The number of elements remaining after compaction.
        */
        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
            // TODO
-            timer().endGpuTimer();
-            return -1;
+            int* dev_boolbuff;
+            int* dev_map;
+            int* dev_out;
+            int* dev_in;
+
+            int rounded_depth = ilog2ceil(n);
+            int rounded_elements = 1 << rounded_depth;
+            dim3 fullBlocksPerGrid((rounded_elements + blockSize - 1) / blockSize);
+
+            //int last_in = idata[]
+            // need a slightly bigger buffer, since with 257 elements the sweeps
+            // iterate up to 512
+            cudaMalloc((void**)&dev_boolbuff, rounded_elements * sizeof(int));
+            checkCUDAErrorFn("malloc dev_boolbuff in failed!");
+            cudaMalloc((void**)&dev_map, rounded_elements * sizeof(int));
+            checkCUDAErrorFn("malloc dev_map in failed!");
+            cudaMalloc((void**)&dev_out, n * sizeof(int));
+            checkCUDAErrorFn("malloc dev_out in failed!");
+            cudaMalloc((void**)&dev_in, rounded_elements * sizeof(int));
+            checkCUDAErrorFn("malloc dev_in in failed!");
+
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("dev_in copy failed!");
+
+            // pad if we need to
+            kernel_padd_0s<<< fullBlocksPerGrid, blockSize >>>(dev_in, n, rounded_elements);
+
+            timer().startGpuTimer();
+            // store 1s and 0s in the bool buffer
+            // have   [3,0,4,0,0,4,0,6,0]
+            // create [1,0,1,0,0,1,0,1,0]
+            StreamCompaction::Common::kernMapToBoolean<<< fullBlocksPerGrid, blockSize >>>(n, dev_boolbuff, dev_in);
+
+            // need to retain this bool data for the kernScatter
+            cudaMemcpy(dev_map, dev_boolbuff, n * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAErrorFn("dev_map copy failed!");
+
+            // dev_scan takes in device buffers; create our map
+            // have   [1,0,1,0,0,1,0,1,0]
+            // create [0,1,1,2,2,2,3,3,4] (exclusive scan)
+            dev_scan(n, rounded_depth, rounded_elements, dev_map, fullBlocksPerGrid);
+
+            // have scan [0,1,1,2,2,2,3,3,4]: index of where to place each kept element
+            // have input[3,0,4,0,0,4,0,6,0]
+            // API call takes:
+            //   device output buffer
+            //   device original input
+            //   device array of bools
+            //   device map created by scan
+            StreamCompaction::Common::kernScatter<<< fullBlocksPerGrid, blockSize >>>(n, dev_out, dev_in, dev_boolbuff, dev_map);
+
+            timer().endGpuTimer();
+
+            // we need to read the last elements of our map and our bool buffer.
+            // the map tells us how many elements precede the last slot, but since
+            // it is an exclusive scan we also read the last element of the bool
+            // array to see if it contains a 1 or 0
+            int last_bool;
+            int last_map;
+            int scatter_size = 0;
+
+            cudaMemcpy(&last_bool, &dev_boolbuff[n-1], sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy last_bool failed!");
+            cudaMemcpy(&last_map, &dev_map[n-1], sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy last_map failed!");
+
+            scatter_size = last_bool + last_map;
+
+            cudaMemcpy(odata, dev_out, scatter_size * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy out failed!");
+
+            cudaFree(dev_boolbuff);
+            cudaFree(dev_map);
+            cudaFree(dev_out);
+            cudaFree(dev_in);
+
+            return scatter_size;
        }
    }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu
index 4308876..7b37439 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.cu
+++ b/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#define blockSize 128
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -13,13 +15,106 @@ namespace StreamCompaction {
         }
 
         // TODO: __global__
+        // shift the inclusive scan right by one to make it exclusive
+        __global__ void kernel_inclusive_to_exclusive(int buff_length, int* odata, int* idata)
+        {
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (tid >= buff_length)
+            {
+                return;
+            }
+
+            if (tid == 0)
+            {
+                odata[tid] = 0;
+            }
+            else
+            {
+                odata[tid] = idata[tid - 1];
+            }
+        }
+
+        // output and input buffers to ping-pong off of
+        __global__ void kernel_scan( int pow, int buff_length, int depth, int* odata, int* idata )
+        {
+            int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+            // get rid of any stray threads
+            if( tid >= buff_length )
+            {
+                // __syncthreads();
+                return;
+            }
+
+            // an exclusive scan in place would race on the 0th element, so
+            // compute an inclusive scan and convert it to exclusive afterwards
+            if( tid < pow ) // already been computed
+            {
+                // __syncthreads(); // need this or will lock ):
+                odata[tid] = idata[tid];
+                return;
+            }
+            // the original way ... but this won't work because threads would be
+            // manipulating their neighbors in place:
+            //odata[tid] = idata[tid-1] + odata[tid-1];
+            int prev = idata[tid - pow];
+            // read your neighbors and wait
+            // __syncthreads();
+            // now we can write as before
+            odata[tid] = idata[tid] + prev;
+            return;
+        }
+
        /**
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
            // TODO
+            int* dev_temp_in;
+            int* dev_input;
+            dim3 fullBlocksPerGrid = ((n + blockSize - 1) / blockSize);
+
+            // create cuda buffers and copy data over
+            cudaMalloc((void**)&dev_temp_in, n * sizeof(int));
+            checkCUDAErrorFn("malloc temp in failed!");
+            cudaMalloc((void**)&dev_input, n * sizeof(int));
+            checkCUDAErrorFn("malloc devinput failed!");
+            // copy n * sizeof(int) bytes to the device
+            cudaMemcpy( dev_input, idata, n * sizeof(int), cudaMemcpyHostToDevice );
+            checkCUDAErrorFn("copy failed!");
+            cudaMemcpy( dev_temp_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("copy failed!");
+
+            int depth = ilog2ceil(n);
+
+            timer().startGpuTimer();
+
+            // one pass per level: i = 1 .. ilog2ceil(n)
+            for(int i = 1; i <= depth; i++)
+            {
+                int pow2 = (1 << (i - 1));
+                //printf("i %d -> depth %d \n ", i, pow2);
+                kernel_scan<<< fullBlocksPerGrid, blockSize >>>(pow2, n, i, dev_temp_in, dev_input);
+                checkCUDAErrorFn("scan failed!");
+                std::swap(dev_temp_in, dev_input);
+                checkCUDAErrorFn("swap failed!");
+            }
+
+            // after the final swap dev_input holds the inclusive result;
+            // shift it right by one into dev_temp_in to make it exclusive
+            kernel_inclusive_to_exclusive<<< fullBlocksPerGrid, blockSize >>>(n, dev_temp_in, dev_input);
+
+            timer().endGpuTimer();
+
+            cudaMemcpy( odata, dev_temp_in, n * sizeof(int), cudaMemcpyDeviceToHost );
+            checkCUDAErrorFn("copy out failed!");
+
+            cudaFree(dev_input);
+            checkCUDAErrorFn("free input failed!");
+            cudaFree(dev_temp_in);
+            checkCUDAErrorFn("free temp failed!");
        }
    }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu
index 1def45e..68c6523 100644
--- a/Project2-Stream-Compaction/stream_compaction/thrust.cu
+++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu
@@ -18,11 +18,17 @@ namespace StreamCompaction {
         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+
+            // build the device vectors before the timer starts, so host<->device
+            // copies are excluded from the measured time
+            thrust::device_vector<int> in(idata, idata + n);
+            thrust::device_vector<int> out(n);
+
            // TODO use `thrust::exclusive_scan`
            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
+            timer().startGpuTimer();
+            thrust::exclusive_scan(in.begin(), in.end(), out.begin());
+            timer().endGpuTimer();
+            thrust::copy(out.begin(), out.end(), odata);
        }
    }
 }
diff --git a/README.md b/README.md
index 3a0b2fe..b54dfe6 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,9 @@
-CUDA Number Algorithms
+CUDA Stream Compaction
 ======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Link to the readmes of the other two subprojects.
-
-Add anything else you think is relevant up to this point.
-(Remember, this is public, so don't put anything here that you don't want to share with the world.)
-
+* Eric Micallef
+  * https://www.linkedin.com/in/eric-micallef-99291714b/
+
+* Tested on: Windows 10, i5, Nvidia GTX1660 (Personal)
+
+Subproject READMEs: [Stream Compaction](Project2-Stream-Compaction/README.md), [Character Recognition](Project2-Character-Recognition/README.md)