Skip to content

Project 2: Davis Polito #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .README.md.un~
Binary file not shown.
Binary file added .vs/Project2-Number-Algorithms/v15/.suo
Binary file not shown.
Binary file added .vs/Project2-Number-Algorithms/v15/Browse.VC.db
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions .vs/ProjectSettings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"CurrentProjectSetting": "x64-Debug (default)"
}
10 changes: 10 additions & 0 deletions .vs/VSWorkspaceState.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"ExpandedNodes": [
"",
"\\Project2-Stream-Compaction",
"\\Project2-Stream-Compaction\\src",
"\\Project2-Stream-Compaction\\stream_compaction"
],
"SelectedNode": "\\Project2-Stream-Compaction\\stream_compaction\\CMakeLists.txt",
"PreviewInSolutionExplorer": false
}
Binary file added .vs/slnx.sqlite
Binary file not shown.
3 changes: 3 additions & 0 deletions Project2-Character-Recognition/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
endif()

include_directories(.)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
add_subdirectory(character_recognition)

cuda_add_executable(${CMAKE_PROJECT_NAME}
Expand All @@ -30,6 +31,8 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
)

target_link_libraries(${CMAKE_PROJECT_NAME}
cublas
curand
character_recognition
${CORELIBS}
)
12 changes: 1 addition & 11 deletions Project2-Character-Recognition/README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,4 @@
CUDA Character Recognition
======================

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)

### (TODO: Your README)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Attempted to implement this, but I never truly understood what I was doing and could not figure out what I was supposed to do.

Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ set(SOURCE_FILES

cuda_add_library(character_recognition
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_61
)
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "common.h"

#include "cublas.h"
void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess == err) {
Expand Down
84 changes: 84 additions & 0 deletions Project2-Character-Recognition/character_recognition/mlp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,90 @@
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "common.h"
#include "mlp.h"
#define blockSize 32
//https://solarianprogrammer.com/2012/05/31/matrix-multiplication-cuda-cublas-curand-thrust/
// C = A * B via cuBLAS SGEMM (no transposes).
//
// cuBLAS is COLUMN-major: A is m x k, B is k x n, C is m x n, all stored
// column-major with leading dimensions lda = m, ldb = k, ldc = m.
//   m = number of rows in A (and C)
//   k = number of columns in A / rows in B
//   n = number of columns in B (and C)
// A, B, C must be DEVICE pointers.
void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
	int lda = m, ldb = k, ldc = m;
	const float alf = 1.0f;   // C = 1*A*B + 0*C
	const float bet = 0.0f;
	const float *alpha = &alf;
	const float *beta = &bet;

	// NOTE(review): creating and destroying a handle on every call is
	// expensive; callers in a hot loop should hoist the handle. Kept here to
	// preserve the existing interface.
	cublasHandle_t handle;
	cublasStatus_t stat = cublasCreate(&handle);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		// The original silently ignored failures, which made every later
		// cuBLAS call fail mysteriously.
		fprintf(stderr, "gpu_blas_mmul: cublasCreate failed (%d)\n", (int)stat);
		return;
	}

	// Do the actual multiplication.
	stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
	                   alpha, A, lda, B, ldb, beta, C, ldc);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		fprintf(stderr, "gpu_blas_mmul: cublasSgemm failed (%d)\n", (int)stat);
	}

	cublasDestroy(handle);
}

// Logistic sigmoid activation: odata[i] = 1 / (1 + exp(-idata[i])).
// Expects a 1D launch covering at least n threads; extra threads return.
__global__ void activation_rule(int n, float const *idata, float* odata) {
	int index = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (index >= n) {
		return;
	}

	// Use the single-precision intrinsic: the original's std::exp with
	// double literals promoted every element to double, which is slow on
	// GPUs and unnecessary for a float kernel.
	odata[index] = 1.0f / (1.0f + expf(-idata[index]));
}

// Derivative of the logistic sigmoid, expressed in terms of the sigmoid's
// OUTPUT: sigma'(x) = sigma(x) * (1 - sigma(x)).
// NOTE(review): idat is therefore assumed to already hold sigmoid
// activations — confirm at call sites.
__global__ void activation_rule_d(int n, float const *idat, float* odata) {
	int i = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (i < n) {
		float s = idat[i];
		odata[i] = s * (1 - s);
	}
}
// Element-wise difference: odata[i] = a[i] - b[i] for i in [0, n).
// One thread per element; threads past n exit immediately.
__global__ void matrixSub(int n, const float* a, const float* b, float* odata) {
	int i = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (i < n) {
		odata[i] = a[i] - b[i];
	}
}


// Element-wise (Hadamard) product used for the backprop delta:
// odata[i] = a[i] * b[i]. The original used a GEMM here, which read a
// length-n vector as an n x n matrix (out-of-bounds).
__global__ void kernElementwiseMul(int n, const float* a, const float* b, float* odata) {
	int i = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (i >= n) {
		return;
	}
	odata[i] = a[i] * b[i];
}

/**
 * Runs 50 forward/backward iterations of a single-hidden-layer MLP.
 *
 * A_input              host input, A_m x A_n floats (forward pass treats it as 1 x A_n)
 * weights_into_layer   host input-to-hidden weights
 * weights_out_layer    host hidden-to-output weights
 * predicted_output     host out-buffer (expected_output_n floats) — receives the last forward output
 * err                  host out-buffer — receives the last (target - output) residual
 * layer / post_activation_layer  host out-buffers (layer_n floats) for the hidden pre/post activation
 * expected_output      host target values, expected_output_n floats
 *
 * Fixes vs. the original: the first GEMM now uses the input-layer weights
 * (not the output-layer weights), matrixSub is launched with its required n
 * argument, output-sized launches use expected_output_n, the delta is an
 * element-wise product instead of an OOB GEMM, results are copied back, and
 * all device allocations are freed.
 */
void train(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, float* predicted_output,
	float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n, float* post_activation_layer) {
	float *dev_input, *dev_weights_0, *dev_weights_1, *dev_layer, *dev_output, *dev_activation_layer, *dev_expected_output;
	cudaMalloc((void**)&dev_input, sizeof(float) * A_m * A_n);
	cudaMemcpy(dev_input, A_input, A_m * A_n * sizeof(float), cudaMemcpyHostToDevice);
	// NOTE(review): the forward pass treats the hidden weights as an
	// A_n x layer_n matrix, but this buffer is sized A_m * layer_n — confirm
	// A_m == A_n at the call site or resize this allocation.
	cudaMalloc((void**)&dev_weights_0, A_m * layer_n * sizeof(float));
	cudaMemcpy(dev_weights_0, weights_into_layer, A_m * layer_n * sizeof(float), cudaMemcpyHostToDevice);
	// NOTE(review): used below as a layer_n x expected_output_n matrix, but
	// only layer_n floats are allocated/copied — valid only when
	// expected_output_n == 1; confirm.
	cudaMalloc((void**)&dev_weights_1, layer_n * sizeof(float));
	cudaMemcpy(dev_weights_1, weights_out_layer, layer_n * sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void**)&dev_layer, sizeof(float) * layer_n);
	cudaMemcpy(dev_layer, layer, layer_n * sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void**)&dev_activation_layer, sizeof(float) * layer_n);
	cudaMalloc((void**)&dev_output, expected_output_n * sizeof(float));
	cudaMalloc((void**)&dev_expected_output, expected_output_n * sizeof(float));
	cudaMemcpy(dev_expected_output, expected_output, expected_output_n * sizeof(float), cudaMemcpyHostToDevice);
	float *dev_subtraction, *dev_derivative, *dev_output_d;
	cudaMalloc((void**)&dev_subtraction, expected_output_n * sizeof(float));
	cudaMalloc((void**)&dev_derivative, expected_output_n * sizeof(float));
	cudaMalloc((void**)&dev_output_d, expected_output_n * sizeof(float));

	const int hiddenBlocks = (layer_n + blockSize - 1) / blockSize;
	const int outputBlocks = (expected_output_n + blockSize - 1) / blockSize;

	for (int i = 0; i < 50; i++) {
		// Forward: hidden = sigmoid(input x W0).
		// BUG FIX: the original multiplied by dev_weights_1 here.
		gpu_blas_mmul(dev_input, dev_weights_0, dev_layer, 1, A_n, layer_n);
		activation_rule<<<hiddenBlocks, blockSize>>>(layer_n, dev_layer, dev_activation_layer);
		// Forward: output = hidden x W1 (1 x layer_n times layer_n x
		// expected_output_n). BUG FIX: the original asked for A_n output
		// columns, overrunning dev_output.
		gpu_blas_mmul(dev_activation_layer, dev_weights_1, dev_output, 1, layer_n, expected_output_n);
		// Backward: delta = (target - output) * sigmoid'(output), all
		// element-wise over the expected_output_n outputs.
		// BUG FIX: matrixSub was launched without its n argument (would not
		// compile), and these launches were sized by layer_n.
		matrixSub<<<outputBlocks, blockSize>>>(expected_output_n, dev_expected_output, dev_output, dev_subtraction);
		activation_rule_d<<<outputBlocks, blockSize>>>(expected_output_n, dev_output, dev_derivative);
		kernElementwiseMul<<<outputBlocks, blockSize>>>(expected_output_n, dev_subtraction, dev_derivative, dev_output_d);
		// TODO(review): dev_output_d is never propagated into weight
		// updates, so the loop does not actually learn yet.
	}

	// Return results to the caller's buffers (the original copied nothing
	// back and leaked every allocation).
	cudaMemcpy(layer, dev_layer, layer_n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(post_activation_layer, dev_activation_layer, layer_n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(predicted_output, dev_output, expected_output_n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(err, dev_subtraction, expected_output_n * sizeof(float), cudaMemcpyDeviceToHost);

	cudaFree(dev_input);
	cudaFree(dev_weights_0);
	cudaFree(dev_weights_1);
	cudaFree(dev_layer);
	cudaFree(dev_activation_layer);
	cudaFree(dev_output);
	cudaFree(dev_expected_output);
	cudaFree(dev_subtraction);
	cudaFree(dev_derivative);
	cudaFree(dev_output_d);
}
__global__ void runTrain(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, const float* predicted_output, float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n) {

namespace CharacterRecognition {
using Common::PerformanceTimer;
Expand Down
137 changes: 0 additions & 137 deletions Project2-Character-Recognition/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,142 +11,5 @@
#include <character_recognition/common.h>
#include "testing_helpers.hpp"

// ---------------------------------------------------------------------------
// Console test harness for the scan and stream-compaction implementations.
// Each section runs a CPU reference first (into b), then each GPU variant
// (into c), and compares c against b while printing elapsed times.
// NOTE(review): this file exercises the StreamCompaction namespaces even
// though it lives under Character-Recognition — confirm the includes resolve.
// ---------------------------------------------------------------------------
const int SIZE = 1 << 8; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
// Shared scratch arrays: a = input, b = CPU reference result, c = result
// under test. Allocated at file scope and freed at the end of main().
int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];

int main(int argc, char* argv[]) {
// Scan tests

printf("\n");
printf("****************\n");
printf("** SCAN TESTS **\n");
printf("****************\n");

genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

// initialize b using StreamCompaction::CPU::scan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
// At first all cases passed because b && c are all zeroes.
zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(SIZE, b, true);

zeroArray(SIZE, c);
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
// NOTE(review): this prints b (the reference) although the NPOT scan just
// filled c — confirm whether printing c was intended.
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
onesArray(SIZE, c);
printDesc("1s array for finding bugs");
StreamCompaction::Naive::scan(SIZE, c, a);
printArray(SIZE, c, true); */

zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
printf("*****************************\n");

// Compaction tests

// Regenerate the input with small values (0..3) so zeros are common and
// compaction actually has elements to remove.
genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

int count, expectedCount, expectedNPOT;

// initialize b using StreamCompaction::CPU::compactWithoutScan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedCount = count;
printArray(count, b, true);
// NOTE(review): this compares b against itself, so it can never report a
// content mismatch — it only records the expected element count.
printCmpLenResult(count, expectedCount, b, b);

zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedNPOT = count;
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
delete[] a;
delete[] b;
delete[] c;
}
Binary file added Project2-Stream-Compaction/img/blocksizeopt.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Project2-Stream-Compaction/img/outputConsole.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Project2-Stream-Compaction/img/sizevstime.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions Project2-Stream-Compaction/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 15; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
Expand Down
3 changes: 1 addition & 2 deletions Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,4 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
)
OPTIONS -arch=sm_61 )
14 changes: 14 additions & 0 deletions Project2-Stream-Compaction/stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // One thread per element; guard against the grid overshooting n.
    int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (tid < n) {
        // Flag is 1 for any nonzero input, 0 otherwise.
        bools[tid] = idata[tid] ? 1 : 0;
    }
}

/**
Expand All @@ -33,6 +39,14 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // One thread per input element; threads past n do nothing.
    int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (tid < n) {
        // Only elements flagged by the boolean map are written, each to the
        // destination slot computed by the exclusive scan.
        if (bools[tid]) {
            odata[indices[tid]] = idata[tid];
        }
    }
}

}
Expand Down
3 changes: 2 additions & 1 deletion Project2-Stream-Compaction/stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <algorithm>
#include <chrono>
#include <stdexcept>

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

Expand Down Expand Up @@ -92,6 +91,8 @@ namespace StreamCompaction {
cudaEventRecord(event_end);
cudaEventSynchronize(event_end);

#include "common.h"

if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }

cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
Expand Down
Loading