Skip to content

Project 2: Davis Polito #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .README.md.un~
Binary file not shown.
Binary file added .vs/Project2-Number-Algorithms/v15/.suo
Binary file not shown.
Binary file added .vs/Project2-Number-Algorithms/v15/Browse.VC.db
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions .vs/ProjectSettings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"CurrentProjectSetting": "x64-Debug (default)"
}
10 changes: 10 additions & 0 deletions .vs/VSWorkspaceState.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"ExpandedNodes": [
"",
"\\Project2-Stream-Compaction",
"\\Project2-Stream-Compaction\\src",
"\\Project2-Stream-Compaction\\stream_compaction"
],
"SelectedNode": "\\Project2-Stream-Compaction\\stream_compaction\\CMakeLists.txt",
"PreviewInSolutionExplorer": false
}
Binary file added .vs/slnx.sqlite
Binary file not shown.
3 changes: 3 additions & 0 deletions Project2-Character-Recognition/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
endif()

include_directories(.)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
add_subdirectory(character_recognition)

cuda_add_executable(${CMAKE_PROJECT_NAME}
Expand All @@ -30,6 +31,8 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
)

target_link_libraries(${CMAKE_PROJECT_NAME}
cublas
curand
character_recognition
${CORELIBS}
)
12 changes: 1 addition & 11 deletions Project2-Character-Recognition/README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,4 @@
CUDA Character Recognition
======================

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)

### (TODO: Your README)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Attempted to implement this, but I never truly understood what I was doing and could not figure out what I was supposed to do.

Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ set(SOURCE_FILES

cuda_add_library(character_recognition
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_61
)
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "common.h"

#include "cublas.h"
void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess == err) {
Expand Down
84 changes: 84 additions & 0 deletions Project2-Character-Recognition/character_recognition/mlp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,90 @@
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "common.h"
#include "mlp.h"
#define blockSize 32
//https://solarianprogrammer.com/2012/05/31/matrix-multiplication-cuda-cublas-curand-thrust/
// C = A * B via cuBLAS SGEMM (no transposes).
//
// cuBLAS is COLUMN-major: A is m x k, B is k x n, C is m x n, all stored
// column-major with leading dimensions lda = m, ldb = k, ldc = m.
//   m = number of rows in A (and C)
//   k = number of columns in A / rows in B
//   n = number of columns in B (and C)
// A, B, C must be DEVICE pointers.
void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) {
	int lda = m, ldb = k, ldc = m;
	const float alf = 1.0f;   // C = 1*A*B + 0*C
	const float bet = 0.0f;
	const float *alpha = &alf;
	const float *beta = &bet;

	// NOTE(review): creating and destroying a handle on every call is
	// expensive; callers in a hot loop should hoist the handle. Kept here to
	// preserve the existing interface.
	cublasHandle_t handle;
	cublasStatus_t stat = cublasCreate(&handle);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		// The original silently ignored failures, which made every later
		// cuBLAS call fail mysteriously.
		fprintf(stderr, "gpu_blas_mmul: cublasCreate failed (%d)\n", (int)stat);
		return;
	}

	// Do the actual multiplication.
	stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
	                   alpha, A, lda, B, ldb, beta, C, ldc);
	if (stat != CUBLAS_STATUS_SUCCESS) {
		fprintf(stderr, "gpu_blas_mmul: cublasSgemm failed (%d)\n", (int)stat);
	}

	cublasDestroy(handle);
}

// Logistic sigmoid activation: odata[i] = 1 / (1 + exp(-idata[i])).
// Expects a 1D launch covering at least n threads; extra threads return.
__global__ void activation_rule(int n, float const *idata, float* odata) {
	int index = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (index >= n) {
		return;
	}

	// Use the single-precision intrinsic: the original's std::exp with
	// double literals promoted every element to double, which is slow on
	// GPUs and unnecessary for a float kernel.
	odata[index] = 1.0f / (1.0f + expf(-idata[index]));
}

// Derivative of the logistic sigmoid, expressed in terms of the sigmoid's
// OUTPUT: sigma'(x) = sigma(x) * (1 - sigma(x)).
// NOTE(review): idat is therefore assumed to already hold sigmoid
// activations — confirm at call sites.
__global__ void activation_rule_d(int n, float const *idat, float* odata) {
	int i = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (i < n) {
		float s = idat[i];
		odata[i] = s * (1 - s);
	}
}
// Element-wise difference: odata[i] = a[i] - b[i] for i in [0, n).
// One thread per element; threads past n exit immediately.
__global__ void matrixSub(int n, const float* a, const float* b, float* odata) {
	int i = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (i < n) {
		odata[i] = a[i] - b[i];
	}
}


// Element-wise (Hadamard) product used for the backprop delta:
// odata[i] = a[i] * b[i]. The original used a GEMM here, which read a
// length-n vector as an n x n matrix (out-of-bounds).
__global__ void kernElementwiseMul(int n, const float* a, const float* b, float* odata) {
	int i = (blockDim.x * blockIdx.x) + threadIdx.x;
	if (i >= n) {
		return;
	}
	odata[i] = a[i] * b[i];
}

/**
 * Runs 50 forward/backward iterations of a single-hidden-layer MLP.
 *
 * A_input              host input, A_m x A_n floats (forward pass treats it as 1 x A_n)
 * weights_into_layer   host input-to-hidden weights
 * weights_out_layer    host hidden-to-output weights
 * predicted_output     host out-buffer (expected_output_n floats) — receives the last forward output
 * err                  host out-buffer — receives the last (target - output) residual
 * layer / post_activation_layer  host out-buffers (layer_n floats) for the hidden pre/post activation
 * expected_output      host target values, expected_output_n floats
 *
 * Fixes vs. the original: the first GEMM now uses the input-layer weights
 * (not the output-layer weights), matrixSub is launched with its required n
 * argument, output-sized launches use expected_output_n, the delta is an
 * element-wise product instead of an OOB GEMM, results are copied back, and
 * all device allocations are freed.
 */
void train(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, float* predicted_output,
	float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n, float* post_activation_layer) {
	float *dev_input, *dev_weights_0, *dev_weights_1, *dev_layer, *dev_output, *dev_activation_layer, *dev_expected_output;
	cudaMalloc((void**)&dev_input, sizeof(float) * A_m * A_n);
	cudaMemcpy(dev_input, A_input, A_m * A_n * sizeof(float), cudaMemcpyHostToDevice);
	// NOTE(review): the forward pass treats the hidden weights as an
	// A_n x layer_n matrix, but this buffer is sized A_m * layer_n — confirm
	// A_m == A_n at the call site or resize this allocation.
	cudaMalloc((void**)&dev_weights_0, A_m * layer_n * sizeof(float));
	cudaMemcpy(dev_weights_0, weights_into_layer, A_m * layer_n * sizeof(float), cudaMemcpyHostToDevice);
	// NOTE(review): used below as a layer_n x expected_output_n matrix, but
	// only layer_n floats are allocated/copied — valid only when
	// expected_output_n == 1; confirm.
	cudaMalloc((void**)&dev_weights_1, layer_n * sizeof(float));
	cudaMemcpy(dev_weights_1, weights_out_layer, layer_n * sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void**)&dev_layer, sizeof(float) * layer_n);
	cudaMemcpy(dev_layer, layer, layer_n * sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void**)&dev_activation_layer, sizeof(float) * layer_n);
	cudaMalloc((void**)&dev_output, expected_output_n * sizeof(float));
	cudaMalloc((void**)&dev_expected_output, expected_output_n * sizeof(float));
	cudaMemcpy(dev_expected_output, expected_output, expected_output_n * sizeof(float), cudaMemcpyHostToDevice);
	float *dev_subtraction, *dev_derivative, *dev_output_d;
	cudaMalloc((void**)&dev_subtraction, expected_output_n * sizeof(float));
	cudaMalloc((void**)&dev_derivative, expected_output_n * sizeof(float));
	cudaMalloc((void**)&dev_output_d, expected_output_n * sizeof(float));

	const int hiddenBlocks = (layer_n + blockSize - 1) / blockSize;
	const int outputBlocks = (expected_output_n + blockSize - 1) / blockSize;

	for (int i = 0; i < 50; i++) {
		// Forward: hidden = sigmoid(input x W0).
		// BUG FIX: the original multiplied by dev_weights_1 here.
		gpu_blas_mmul(dev_input, dev_weights_0, dev_layer, 1, A_n, layer_n);
		activation_rule<<<hiddenBlocks, blockSize>>>(layer_n, dev_layer, dev_activation_layer);
		// Forward: output = hidden x W1 (1 x layer_n times layer_n x
		// expected_output_n). BUG FIX: the original asked for A_n output
		// columns, overrunning dev_output.
		gpu_blas_mmul(dev_activation_layer, dev_weights_1, dev_output, 1, layer_n, expected_output_n);
		// Backward: delta = (target - output) * sigmoid'(output), all
		// element-wise over the expected_output_n outputs.
		// BUG FIX: matrixSub was launched without its n argument (would not
		// compile), and these launches were sized by layer_n.
		matrixSub<<<outputBlocks, blockSize>>>(expected_output_n, dev_expected_output, dev_output, dev_subtraction);
		activation_rule_d<<<outputBlocks, blockSize>>>(expected_output_n, dev_output, dev_derivative);
		kernElementwiseMul<<<outputBlocks, blockSize>>>(expected_output_n, dev_subtraction, dev_derivative, dev_output_d);
		// TODO(review): dev_output_d is never propagated into weight
		// updates, so the loop does not actually learn yet.
	}

	// Return results to the caller's buffers (the original copied nothing
	// back and leaked every allocation).
	cudaMemcpy(layer, dev_layer, layer_n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(post_activation_layer, dev_activation_layer, layer_n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(predicted_output, dev_output, expected_output_n * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(err, dev_subtraction, expected_output_n * sizeof(float), cudaMemcpyDeviceToHost);

	cudaFree(dev_input);
	cudaFree(dev_weights_0);
	cudaFree(dev_weights_1);
	cudaFree(dev_layer);
	cudaFree(dev_activation_layer);
	cudaFree(dev_output);
	cudaFree(dev_expected_output);
	cudaFree(dev_subtraction);
	cudaFree(dev_derivative);
	cudaFree(dev_output_d);
}
__global__ void runTrain(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, const float* predicted_output, float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n) {

namespace CharacterRecognition {
using Common::PerformanceTimer;
Expand Down
137 changes: 0 additions & 137 deletions Project2-Character-Recognition/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,142 +11,5 @@
#include <character_recognition/common.h>
#include "testing_helpers.hpp"

// ---------------------------------------------------------------------------
// Console test harness for the scan and stream-compaction implementations.
// Each section runs a CPU reference first (into b), then each GPU variant
// (into c), and compares c against b while printing elapsed times.
// NOTE(review): this file exercises the StreamCompaction namespaces even
// though it lives under Character-Recognition — confirm the includes resolve.
// ---------------------------------------------------------------------------
const int SIZE = 1 << 8; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
// Shared scratch arrays: a = input, b = CPU reference result, c = result
// under test. Allocated at file scope and freed at the end of main().
int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];

int main(int argc, char* argv[]) {
// Scan tests

printf("\n");
printf("****************\n");
printf("** SCAN TESTS **\n");
printf("****************\n");

genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

// initialize b using StreamCompaction::CPU::scan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
// At first all cases passed because b && c are all zeroes.
zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(SIZE, b, true);

zeroArray(SIZE, c);
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
// NOTE(review): this prints b (the reference) although the NPOT scan just
// filled c — confirm whether printing c was intended.
printArray(NPOT, b, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
onesArray(SIZE, c);
printDesc("1s array for finding bugs");
StreamCompaction::Naive::scan(SIZE, c, a);
printArray(SIZE, c, true); */

zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
printf("*****************************\n");

// Compaction tests

// Regenerate the input with small values (0..3) so zeros are common and
// compaction actually has elements to remove.
genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

int count, expectedCount, expectedNPOT;

// initialize b using StreamCompaction::CPU::compactWithoutScan you implement
// We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
zeroArray(SIZE, b);
printDesc("cpu compact without scan, power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedCount = count;
printArray(count, b, true);
// NOTE(review): this compares b against itself, so it can never report a
// content mismatch — it only records the expected element count.
printCmpLenResult(count, expectedCount, b, b);

zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
expectedNPOT = count;
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
delete[] a;
delete[] b;
delete[] c;
}
Binary file added Project2-Stream-Compaction/img/blocksizeopt.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Project2-Stream-Compaction/img/outputConsole.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Project2-Stream-Compaction/img/sizevstime.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions Project2-Stream-Compaction/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 15; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
Expand Down
3 changes: 1 addition & 2 deletions Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,4 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
)
OPTIONS -arch=sm_61 )
14 changes: 14 additions & 0 deletions Project2-Stream-Compaction/stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // One thread per element; guard against the grid overshooting n.
    int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (tid < n) {
        // Flag is 1 for any nonzero input, 0 otherwise.
        bools[tid] = idata[tid] ? 1 : 0;
    }
}

/**
Expand All @@ -33,6 +39,14 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // One thread per input element; threads past n do nothing.
    int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (tid < n) {
        // Only elements flagged by the boolean map are written, each to the
        // destination slot computed by the exclusive scan.
        if (bools[tid]) {
            odata[indices[tid]] = idata[tid];
        }
    }
}

}
Expand Down
3 changes: 2 additions & 1 deletion Project2-Stream-Compaction/stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <algorithm>
#include <chrono>
#include <stdexcept>

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

Expand Down Expand Up @@ -92,6 +91,8 @@ namespace StreamCompaction {
cudaEventRecord(event_end);
cudaEventSynchronize(event_end);

#include "common.h"

if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }

cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
Expand Down
Loading