diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt
index 09e9198..1caf239 100644
--- a/Project2-Character-Recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/CMakeLists.txt
@@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 endif()
 
 include_directories(.)
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
 add_subdirectory(character_recognition)
 
 cuda_add_executable(${CMAKE_PROJECT_NAME}
@@ -32,4 +33,5 @@ cuda_add_executable(${CMAKE_PROJECT_NAME}
 target_link_libraries(${CMAKE_PROJECT_NAME}
     character_recognition
     ${CORELIBS}
+    cublas
     )
diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md
index 4503fac..9664210 100644
--- a/Project2-Character-Recognition/README.md
+++ b/Project2-Character-Recognition/README.md
@@ -3,12 +3,158 @@ CUDA Character Recognition
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zheyuan Xie
+* Tested on: Windows 10 Pro, i7-7700HQ @ 2.80GHz 2.80GHz, 16GB, GTX 1050 2GB (Dell XPS 15 9560)
 
-### (TODO: Your README)
+## Description
+In this project, a three-layer perceptron is implemented using CUDA.The implementation is tested on the XOR example and the given character recognition dataset. 
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The three-layer perceptron is warpped into a class `MLP3`, which is declared in `mlp.h` and implemented in `mlp.cu`. Related CUDA kernels and cuBLAS helper functions are implemented in `mlp.cu`. The neural network is composed of three layers:
+ - An input layer
+ - A fully connected hidden layer with Sigmoid activation
+ - A fully connected output layer with Sigmoid activation
 
+Currently we do not support changing the number of layers. But the number of neurons (N_input, N_hidden, N_output) in each layer is configurable at consturction. The training data and labels are provided in batch as N_batch x N_input and N_batch x N_ouptut matrix stored in *column-major* arrays. The training undergoes the following steps on every epoch:
+ 
+ 1. **Forward pass**: given inputs, compute the outputs of the network using current weights.
+ 2. **Backward propagation**: back propagate the training error and compute the gradients of each weight.
+ 3. **Calculate Loss**: compute the total training loss of current batch.
+ 4. **Update weights**: update the gradient using naive gradient descent.
+
+ All calculations are implemented in parallel using custom CUDA kernels and cuCLABS library. Since the training process is very fast, the weights are not saved to any file. To test the implementation we can re-train the network within seconds.
+
+ **Extra Credit:** Instead of doing 1d vector for input to output, matrix multiplication of the weights is used between each layer of the overall network as well as in back propagation.
+
+## Traning Result
+### XOR Example
+ - input neuron: 2
+ - hidden neuron: 2
+ - output neuron: 1
+ - batch size: 4
+ - initial weights: As given in the Excel
+ - learning rate: 10^-2 fixed
+
+The initial weights are set according to the Excel example. Values generated in the first pass is compared and verified. The graph below shows the change of total error in the first 200 epoches.
+
+![](img/xor_curve.jpg)
+
+Debug Output:
+```
+--- XOR Example ---
+epoch: 0 | cost: 0.00286369
+epoch: 10 | cost: 0.00229358
+epoch: 20 | cost: 0.00190236
+epoch: 30 | cost: 0.00162444
+epoch: 40 | cost: 0.00142166
+epoch: 50 | cost: 0.00127057
+epoch: 60 | cost: 0.0011561
+epoch: 70 | cost: 0.00106817
+epoch: 80 | cost: 0.000999872
+epoch: 90 | cost: 0.000946314
+epoch: 100 | cost: 0.000903958
+epoch: 110 | cost: 0.000870197
+epoch: 120 | cost: 0.000843124
+epoch: 130 | cost: 0.000821268
+epoch: 140 | cost: 0.000803517
+epoch: 150 | cost: 0.000789034
+epoch: 160 | cost: 0.000777127
+epoch: 170 | cost: 0.000767323
+epoch: 180 | cost: 0.000759189
+epoch: 190 | cost: 0.000752402
+```
+
+### Character Recognition
+ - input neuron: 10201
+ - hidden neuron: 128
+ - output neuron: 52
+ - batch size: 52
+ - initial weights: random
+ - learning rate: 10^-4 fixed
+ 
+In the character recgnition dataset, we are given 52 101x101 images representing each of the 26 letters in the alphabet with and without capitalization. Each input is a 1x10201 array containing the pixel values of the image, and the correspond label is a 1x52 one hot vector. These data are used directly in training without further preprocessing.
+
+The graph below shows the change of total error in the first 200 epoches.
+
+ ![](img/image_curve.jpg)
+
+ Another graph shows the evolution of prediction accuracy in the training set. The network reaches 100% accuracy on the training set at about 80th epoch. The cost continues to decrease after achieving 100% accuracy on the training set.
+
+ ![](img/image_corr.jpg)
+
+Debug Output:
+```
+--- Character Recognition ---
+epoch: 0 | cost: 0.291311 | correct: 82.6923% | time elapsed: 32.632 ms
+epoch: 10 | cost: 0.142109 | correct: 96.1538% | time elapsed: 26.9052 ms
+epoch: 20 | cost: 0.0961198 | correct: 100% | time elapsed: 26.2422 ms
+epoch: 30 | cost: 0.063884 | correct: 100% | time elapsed: 24.8418 ms
+epoch: 40 | cost: 0.0623333 | correct: 98.0769% | time elapsed: 26.4026 ms
+epoch: 50 | cost: 0.0497727 | correct: 100% | time elapsed: 26.6263 ms
+epoch: 60 | cost: 0.0461378 | correct: 100% | time elapsed: 26.3336 ms
+epoch: 70 | cost: 0.0356991 | correct: 100% | time elapsed: 27.0863 ms
+epoch: 80 | cost: 0.0315837 | correct: 100% | time elapsed: 26.8796 ms
+epoch: 90 | cost: 0.0304885 | correct: 100% | time elapsed: 28.085 ms
+epoch: 100 | cost: 0.026108 | correct: 100% | time elapsed: 25.3946 ms
+epoch: 110 | cost: 0.0254792 | correct: 100% | time elapsed: 26.126 ms
+epoch: 120 | cost: 0.0291465 | correct: 100% | time elapsed: 25.2853 ms
+epoch: 130 | cost: 0.0230684 | correct: 100% | time elapsed: 21.3816 ms
+epoch: 140 | cost: 0.0249249 | correct: 100% | time elapsed: 26.5961 ms
+epoch: 150 | cost: 0.0206818 | correct: 100% | time elapsed: 26.4256 ms
+epoch: 160 | cost: 0.0191929 | correct: 100% | time elapsed: 25.502 ms
+epoch: 170 | cost: 0.0171115 | correct: 100% | time elapsed: 27.0411 ms
+epoch: 180 | cost: 0.0167711 | correct: 100% | time elapsed: 24.4687 ms
+epoch: 190 | cost: 0.0161567 | correct: 100% | time elapsed: 27.0288 ms
+---
+Target Label:1, Predicted Label:1
+Target Label:2, Predicted Label:2
+Target Label:3, Predicted Label:3
+Target Label:4, Predicted Label:4
+Target Label:5, Predicted Label:5
+Target Label:6, Predicted Label:6
+Target Label:7, Predicted Label:7
+Target Label:8, Predicted Label:8
+Target Label:9, Predicted Label:9
+Target Label:10, Predicted Label:10
+Target Label:11, Predicted Label:11
+Target Label:12, Predicted Label:12
+Target Label:13, Predicted Label:13
+Target Label:14, Predicted Label:14
+Target Label:15, Predicted Label:15
+Target Label:16, Predicted Label:16
+Target Label:17, Predicted Label:17
+Target Label:18, Predicted Label:18
+Target Label:19, Predicted Label:19
+Target Label:20, Predicted Label:20
+Target Label:21, Predicted Label:21
+Target Label:22, Predicted Label:22
+Target Label:23, Predicted Label:23
+Target Label:24, Predicted Label:24
+Target Label:25, Predicted Label:25
+Target Label:26, Predicted Label:26
+Target Label:27, Predicted Label:27
+Target Label:28, Predicted Label:28
+Target Label:29, Predicted Label:29
+Target Label:30, Predicted Label:30
+Target Label:31, Predicted Label:31
+Target Label:32, Predicted Label:32
+Target Label:33, Predicted Label:33
+Target Label:34, Predicted Label:34
+Target Label:35, Predicted Label:35
+Target Label:36, Predicted Label:36
+Target Label:37, Predicted Label:37
+Target Label:38, Predicted Label:38
+Target Label:39, Predicted Label:39
+Target Label:40, Predicted Label:40
+Target Label:41, Predicted Label:41
+Target Label:42, Predicted Label:42
+Target Label:43, Predicted Label:43
+Target Label:44, Predicted Label:44
+Target Label:45, Predicted Label:45
+Target Label:46, Predicted Label:46
+Target Label:47, Predicted Label:47
+Target Label:48, Predicted Label:48
+Target Label:49, Predicted Label:49
+Target Label:50, Predicted Label:50
+Target Label:51, Predicted Label:51
+Target Label:52, Predicted Label:52
+```
diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
index 7446175..9e834c1 100644
--- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt
+++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt
@@ -7,5 +7,5 @@ set(SOURCE_FILES
 
 cuda_add_library(character_recognition
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu
index 5a3ed7f..955e152 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.cu
+++ b/Project2-Character-Recognition/character_recognition/mlp.cu
@@ -2,26 +2,290 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "mlp.h"
+#include <cublas_v2.h>
+#include <iostream>
+#include <thrust/random.h>
+
+// cuBLAS matrix multiplication
+void gpu_blas_mmul(cublasHandle_t &handle, const float *A, const float *B, float *C, const int m, const int k, const int n) {
+  const float alpha = 1.;
+  const float beta = 0.;
+  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k, &beta, C, m);
+}
+
+// cuBLAS matrix transpose
+void gpu_blas_mtrans(cublasHandle_t &handle, const float *A, float *B, const int m, const int n) {
+  float alpha = 1.;
+  float beta = 0.;
+  cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, A, n, &beta, A, n, B, m);
+}
 
 namespace CharacterRecognition {
-    using Common::PerformanceTimer;
-    PerformanceTimer& timer()
-    {
-        static PerformanceTimer timer;
-        return timer;
-    }
+  using Common::PerformanceTimer;
+  PerformanceTimer& timer()
+  {
+      static PerformanceTimer timer;
+      return timer;
+  }
+
+  const int blockSize = 128;
         
-    // TODO: __global__
+  // Kernals
+  __global__ void kernCalcSigmoid(int n, float *out, float *in) {
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= n) { 
+      return; 
+    }
+    out[idx] = 1 / (1 + exp(-in[idx]));
+  }
+
+  __global__ void kernCalcSigmoidDerivative(int n, float *out, float *in) {
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= n) {
+      return;
+    }
+    float sigma = 1 / (1 + exp(-in[idx]));
+    out[idx] = (1 - sigma) * sigma;
+  }
+
+  __global__ void kernElementWiseMultiplication(int n, float *out, float *in1, float *in2) {
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= n) {
+      return;
+    }
+    out[idx] = in1[idx] * in2[idx];
+  }
+
+  __global__ void kernUpdateWeight(int n, float *weights, float *gradients, float lambda) {
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= n) {
+      return;
+    }
+    weights[idx] = weights[idx] - gradients[idx] * lambda;
+  }
+
+  __global__ void kernElementWiseSubtraction(int n_cols, int n_rows, float *out, float *in1, float *in2) {
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= n_cols * n_rows) {
+      return;
+    }
+    out[idx] = in1[idx] - in2[idx];
+  }
+
+  __global__ void kernCalcSquareError(int n, float *out, float *in1, float *in2) {
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= n) {
+      return;
+    }
+    float diff = in1[idx] - in2[idx];
+    out[idx] = diff * diff;
+  }
+
+  __host__ __device__ unsigned int hash(unsigned int a) {
+    a = (a + 0x7ed55d16) + (a << 12);
+    a = (a ^ 0xc761c23c) ^ (a >> 19);
+    a = (a + 0x165667b1) + (a << 5);
+    a = (a + 0xd3a2646c) ^ (a << 9);
+    a = (a + 0xfd7046c5) + (a << 3);
+    a = (a ^ 0xb55a4f09) ^ (a >> 16);
+    return a;
+  }
+
+  __global__ void kernRandomNumber(int n, int time, float* array) {
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= n) {
+      return;
+    }
+    thrust::default_random_engine rng(hash((int)(idx * time)));
+    thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+    array[idx] = (float)unitDistrib(rng);
+  }
+
+	// MLP3 function definitions
+
+  MLP3::MLP3(int input_size, int hidden_size, int output_size) {
+    input_size_ = input_size;
+    hidden_size_ = hidden_size;
+    output_size_ = output_size;
+
+    cudaMalloc((void**)&wkj_, input_size_ * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&wji_, hidden_size_ * output_size_ * sizeof(float));
+    cudaMalloc((void**)&gwkj_, input_size_ * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&gwji_, hidden_size_ * output_size_ * sizeof(float));
+
+    // create cublasHandle
+    cublasCreate(&cublas_handle_);
+  }
+
+  MLP3::~MLP3() {
+    cudaFree(wkj_);
+    cudaFree(wji_);
+    cudaFree(gwkj_);
+    cudaFree(gwji_);
+
+    // destroy cublasHandle
+    cublasDestroy(cublas_handle_);
+  }
 
-    /**
-        * Example of use case (follow how you did it in stream compaction)
-        */
-    /*void scan(int n, int *odata, const int *idata) {
-        timer().startGpuTimer();
-        // TODO
-        timer().endGpuTimer();
+  void MLP3::init_weights(float *wkj, float *wji) {
+    cudaMemcpy(wkj_, wkj, sizeof(float) * input_size_ * hidden_size_, cudaMemcpyHostToDevice);
+    cudaMemcpy(wji_, wji, sizeof(float) * hidden_size_ * output_size_, cudaMemcpyHostToDevice);
+  }
+
+  void MLP3::init_weights() {
+    // TODO: initialize random weights
+    int n_w1 = input_size_ * hidden_size_;
+    int gridSize1 = (n_w1 + blockSize - 1) / blockSize;
+    kernRandomNumber<<<gridSize1, blockSize>>>(n_w1, 1, wji_);
+
+    int n_w2 = hidden_size_ * output_size_;
+    int gridSize2 = (n_w2 + blockSize - 1) / blockSize;
+    kernRandomNumber<<<gridSize1, blockSize>>>(n_w2, 1, wkj_);
+  }
+
+  void MLP3::train(float *x_train, float *y_train, int n_data, int n_epoch, bool verbose) {
+    n_data_ = n_data;
+    cudaMalloc((void**)&dev_input_, n_data * input_size_ * sizeof(float));
+    cudaMalloc((void**)&dev_target_, n_data * output_size_ * sizeof(float));
+    cudaMemcpy(dev_input_, x_train, n_data * input_size_ * sizeof(float), cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_target_, y_train, n_data * output_size_ * sizeof(float), cudaMemcpyHostToDevice);
+
+    cudaMalloc((void**)&dev_hidden_, n_data * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&dev_hidden_sigmoid_, n_data * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&dev_output_, n_data * output_size_ * sizeof(float));
+    cudaMalloc((void**)&dev_output_sigmoid_, n_data * output_size_ * sizeof(float));
+
+    for (int epoch = 0; epoch < n_epoch; epoch++) {
+      forward();
+      back_propagation();
+      calculate_loss();
+      update_weights();
+      if (verbose && ((epoch == n_epoch - 1) || (epoch + 1) % 5 == 0)) {
+        std::cout << "epoch: " << epoch + 1 << ", cost: " << total_error_ << std::endl;
+      }
+    }
+
+    cudaFree(dev_input_);
+    cudaFree(dev_target_);
+    cudaFree(dev_hidden_);
+    cudaFree(dev_hidden_sigmoid_);
+    cudaFree(dev_output_);
+    cudaFree(dev_output_sigmoid_);
+  }
+
+  void MLP3::predict(float *x_input, float *y_pred, int n_data) {
+    n_data_ = n_data;
+    cudaMalloc((void**)&dev_input_, n_data * input_size_ * sizeof(float));
+    cudaMemcpy(dev_input_, x_input, n_data * input_size_ * sizeof(float), cudaMemcpyHostToDevice);
+    
+    cudaMalloc((void**)&dev_hidden_, n_data * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&dev_hidden_sigmoid_, n_data * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&dev_output_, n_data * output_size_ * sizeof(float));
+    cudaMalloc((void**)&dev_output_sigmoid_, n_data * output_size_ * sizeof(float));
+
+    forward();
+    cudaMemcpy(y_pred, dev_output_sigmoid_, n_data * output_size_ * sizeof(float), cudaMemcpyDeviceToHost);
+    
+    cudaFree(dev_input_);
+    cudaFree(dev_hidden_);
+    cudaFree(dev_hidden_sigmoid_);
+    cudaFree(dev_output_);
+    cudaFree(dev_output_sigmoid_);
+  }
+
+  void MLP3::forward() {
+    gpu_blas_mmul(cublas_handle_, dev_input_, wkj_, dev_hidden_, n_data_, input_size_, hidden_size_);
+
+    dim3 gridSize1((hidden_size_ * n_data_ + blockSize - 1) / blockSize);
+    kernCalcSigmoid<<<gridSize1, blockSize>>>(hidden_size_ * n_data_, dev_hidden_sigmoid_, dev_hidden_);
+
+    gpu_blas_mmul(cublas_handle_, dev_hidden_sigmoid_, wji_, dev_output_, n_data_, hidden_size_, output_size_);
+
+    dim3 gridSize2((output_size_ * n_data_ + blockSize - 1) / blockSize);
+    kernCalcSigmoid<<<gridSize2, blockSize>>>(output_size_ * n_data_, dev_output_sigmoid_, dev_output_);
+  }
+
+  void MLP3::back_propagation() {
+    float* temp1, *temp2, *temp3, *temp4, *temp5, *temp6, *temp7, *temp8, *wji_T;
+
+    cudaMalloc((void**)&temp1, n_data_ * output_size_ * sizeof(float));
+    cudaMalloc((void**)&temp2, n_data_ * output_size_ * sizeof(float));
+    cudaMalloc((void**)&temp3, n_data_ * output_size_ * sizeof(float));
+    cudaMalloc((void**)&temp4, n_data_ * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&temp5, n_data_ * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&temp6, n_data_ * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&wji_T, output_size_ * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&temp7, n_data_ * hidden_size_ * sizeof(float));
+    cudaMalloc((void**)&temp8, n_data_ * input_size_ * sizeof(float));
+
+    // calculate wji gradient
+    int num_element1 = output_size_ * n_data_;
+    dim3 gridSize1((num_element1 + blockSize - 1) / blockSize);
+    kernCalcSigmoidDerivative<<<gridSize1, blockSize>>>(num_element1, temp1, dev_output_sigmoid_);
+    kernElementWiseSubtraction<<<gridSize1, blockSize>>>(n_data_, output_size_, temp2, dev_output_sigmoid_, dev_target_);
+    kernElementWiseMultiplication<<<gridSize1, blockSize>>>(num_element1, temp3, temp1, temp2);
+
+    gpu_blas_mtrans(cublas_handle_, dev_hidden_sigmoid_, temp4, hidden_size_, n_data_);
+    gpu_blas_mmul(cublas_handle_, temp4, temp3, gwji_, hidden_size_, n_data_, output_size_);
+
+    // calculate wkj gradient
+    int num_element2 = hidden_size_ * n_data_;
+    dim3 gridSize2((num_element2 + blockSize - 1) / blockSize);
+    kernCalcSigmoidDerivative<<<gridSize2, blockSize>>>(num_element2, temp5, dev_hidden_sigmoid_);
+    gpu_blas_mtrans(cublas_handle_, wji_, wji_T, output_size_, hidden_size_);
+    gpu_blas_mmul(cublas_handle_, temp3, wji_T, temp6, n_data_, output_size_, hidden_size_);
+    kernElementWiseMultiplication<<<gridSize2, blockSize>>>(num_element2, temp7, temp6, temp5);
+
+    gpu_blas_mtrans(cublas_handle_, dev_input_, temp8, input_size_, n_data_);
+    gpu_blas_mmul(cublas_handle_, temp8, temp7, gwkj_, input_size_, n_data_, hidden_size_);
+
+    // free cuda memory
+    cudaFree(temp1);
+    cudaFree(temp2);
+    cudaFree(temp3);
+    cudaFree(temp4);
+    cudaFree(temp5);
+    cudaFree(temp6);
+    cudaFree(temp7);
+    cudaFree(temp8);
+    cudaFree(wji_T);
+  }
+
+  void MLP3::calculate_loss() {
+    float *dev_square_error;
+    int n = n_data_ * output_size_;
+    cudaMalloc((void**)&dev_square_error, sizeof(float) * n);
+    int gridSize = (n + blockSize - 1) / blockSize;
+    kernCalcSquareError<<<gridSize, blockSize>>>(n, dev_square_error, dev_target_, dev_output_sigmoid_);
+    
+    float *square_error = new float[n]();
+    cudaMemcpy(square_error, dev_square_error, n * sizeof(float), cudaMemcpyDeviceToHost);
+
+    total_error_ = 0.;
+    for (int i = 0; i < n; i++) {
+      total_error_ += square_error[i];
     }
-    */
+    total_error_ /= (2.0 * output_size_);
+
+    delete[] square_error;
+    cudaFree(dev_square_error);
+  }
+
+  void MLP3::update_weights() {
+    float lambda = 0.01;
+
+    float* temp1 = new float(2);
+    cudaMemcpy(temp1, wji_, 2 * sizeof(float), cudaMemcpyDeviceToHost);
+
+    // update wji
+    int gridSize1 = (hidden_size_ * output_size_ + blockSize) / blockSize;
+    kernUpdateWeight<<<gridSize1, blockSize>>>(hidden_size_ * output_size_, wji_, gwji_, lambda);
+    
+    float* temp2 = new float(2);
+    cudaMemcpy(temp2, wji_, 2 * sizeof(float), cudaMemcpyDeviceToHost);
 
-	// TODO: implement required elements for MLP sections 1 and 2 here
+    // update wkj
+    int gridSize2 = (input_size_ * hidden_size_ + blockSize) / blockSize;
+    kernUpdateWeight<<<gridSize2, blockSize>>>(input_size_ * hidden_size_, wkj_, gwkj_, lambda);
+  }
 }
diff --git a/Project2-Character-Recognition/character_recognition/mlp.h b/Project2-Character-Recognition/character_recognition/mlp.h
index 2096228..d2d47e7 100644
--- a/Project2-Character-Recognition/character_recognition/mlp.h
+++ b/Project2-Character-Recognition/character_recognition/mlp.h
@@ -1,9 +1,54 @@
 #pragma once
 
 #include "common.h"
+#include <cublas_v2.h>
 
 namespace CharacterRecognition {
     Common::PerformanceTimer& timer();
+    
+    class MLP3 {
+    public:
+      // constructor
+      MLP3(int n_input, int n_hidden, int n_output);
 
-    // TODO: implement required elements for MLP sections 1 and 2 here
+      // destructor
+      ~MLP3();
+
+      // accessor
+      float total_error() const {
+        return total_error_;
+      }
+      
+      void init_weights(float *wkj, float *wji);
+      void init_weights();
+      void predict(float *x_input, float *y_pred, int n_data);
+      void train(float *x_train, float *y_train, int n_data, int n_epoch, bool verbose);
+
+    private:
+      void forward();
+      void calculate_loss();
+      void back_propagation();
+      void update_weights();
+
+      int input_size_;
+      int hidden_size_;
+      int output_size_;
+
+      // weight & gradients
+      float *wkj_, *wji_;
+      float *gwkj_, *gwji_;
+
+      // layers
+      int n_data_;
+      float* dev_input_;
+      float* dev_hidden_;
+      float* dev_hidden_sigmoid_;
+      float* dev_output_;
+      float* dev_output_sigmoid_;
+      float* dev_target_;
+
+      float total_error_;
+
+      cublasHandle_t cublas_handle_;
+    };
 }
diff --git a/Project2-Character-Recognition/img/image_corr.jpg b/Project2-Character-Recognition/img/image_corr.jpg
new file mode 100644
index 0000000..9c98774
Binary files /dev/null and b/Project2-Character-Recognition/img/image_corr.jpg differ
diff --git a/Project2-Character-Recognition/img/image_curve.jpg b/Project2-Character-Recognition/img/image_curve.jpg
new file mode 100644
index 0000000..c856589
Binary files /dev/null and b/Project2-Character-Recognition/img/image_curve.jpg differ
diff --git a/Project2-Character-Recognition/img/xor_curve.jpg b/Project2-Character-Recognition/img/xor_curve.jpg
new file mode 100644
index 0000000..ab443eb
Binary files /dev/null and b/Project2-Character-Recognition/img/xor_curve.jpg differ
diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp
index 11dd534..adde1dd 100644
--- a/Project2-Character-Recognition/src/main.cpp
+++ b/Project2-Character-Recognition/src/main.cpp
@@ -1,152 +1,168 @@
 /**
  * @file      main.cpp
- * @brief     Stream compaction test program
- * @authors   Kai Ninomiya
- * @date      2015
+ * @brief     Character Recognition test program
+ * @authors   Zheyuan Xie
+ * @date      2019
  * @copyright University of Pennsylvania
  */
 
 #include <cstdio>
 #include <character_recognition/mlp.h>
 #include <character_recognition/common.h>
-#include "testing_helpers.hpp"
+#include "testing_helpers.hpp"''
 
-const int SIZE = 1 << 8; // feel free to change the size of array
-const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
+#include <string>
+#include <fstream>
+using namespace std;
+
+// find the index of the maximum element in an array.
+int find_max_idx(int n, float* array);
+bool test_image(CharacterRecognition::MLP3 &mlp, int label, bool verbose);
+
+void train_XOR() {
+  const int input_size = 2;
+  const int hidden_size = 4;
+  const int output_size = 1;
+
+  CharacterRecognition::MLP3 xor_mlp(input_size, hidden_size, output_size);
+
+  // initial weights (from Hannah excel)
+  float *wkj = new float[input_size * hidden_size];
+  wkj[0] = 10.1;
+  wkj[1] = 20;
+  wkj[2] = 0.9;
+  wkj[3] = 0.87;
+  float *wji = new float[hidden_size * output_size];
+  wji[0] = 41;
+  wji[1] = -54;
+  xor_mlp.init_weights(wkj, wji);
+
+
+  // training input (row major)
+  const int num_inputs = 4;
+  float *xor_input = new float[num_inputs * input_size];
+  float *xor_label = new float[num_inputs * output_size];
+
+  xor_input[0] = 0; xor_input[4] = 0;
+  xor_label[0] = 0;
+
+  xor_input[1] = 0; xor_input[5] = 1;
+  xor_label[1] = 1;
+
+  xor_input[2] = 1; xor_input[6] = 0;
+  xor_label[2] = 1;
+
+  xor_input[3] = 1; xor_input[7] = 1;
+  xor_label[3] = 0;
+
+  cout << "--- XOR Example ---" << endl;
+  for (int t = 0; t < 20; t++) {
+    // Train
+    xor_mlp.train(xor_input, xor_label, num_inputs, 10, false);
+    float cost = xor_mlp.total_error();
+
+    std::cout << "epoch: " << t * 10 << " | cost: " << cost << std::endl;
+  }
+}
+
+void train_image() {
+  const int input_size = 10201;
+  const int hidden_size = 128;
+  const int output_size = 52;
+
+  CharacterRecognition::MLP3 image_mlp(input_size, hidden_size, output_size);
+  image_mlp.init_weights();
+
+  // training input (row major)
+  const int num_inputs = 52;
+  float *image_input = new float[num_inputs * input_size];
+  float *image_label = new float[num_inputs * output_size];
+  memset(image_label, 0, sizeof(float) * num_inputs * output_size);
+
+  for (int i = 0; i < 52; i++) {
+    string file_idx = to_string(i+1);
+    if (file_idx.length() == 1) {
+      file_idx = "0" + file_idx;
+    }
+    string file_name = "../data-set/" + file_idx + "info.txt";
+    ifstream input_stream(file_name);
+    if (!input_stream) cerr << "Can't open input file:" << file_name << endl;
+    int first, second;
+    input_stream >> first;
+    input_stream >> second;
+    for (int j = 0; j < input_size; j++) {
+      input_stream >> image_input[j * num_inputs + i];
+    }
+    image_label[i * num_inputs + i] = 1.0;
+  }
+
+  cout << "--- Character Recognition ---" << endl;
+  for (int t = 0; t < 20; t++) {
+    // Train
+    CharacterRecognition::timer().startCpuTimer();
+    image_mlp.train(image_input, image_label, num_inputs, 10, false);
+    CharacterRecognition::timer().endCpuTimer();
+    float time_elapsed = CharacterRecognition::timer().getCpuElapsedTimeForPreviousOperation();
+    float cost = image_mlp.total_error();
+
+    // Test
+    int correct_cnt = 0;
+    for (int i = 1; i <= 52; i++) {
+      if (test_image(image_mlp, i, false)) {
+        correct_cnt++;
+      }
+    }
+    std::cout << "epoch: " << t * 10 << " | cost: " << cost << " | correct: " << correct_cnt / 52.0 * 100 << "% | time elapsed: " << time_elapsed << " ms" << std::endl;
+  }
+  cout << "---" << endl;
+  for (int i = 1; i <= 52; i++) {
+    if (test_image(image_mlp, i, true)) {
+    }
+  }
+}
+
+bool test_image(CharacterRecognition::MLP3 &mlp, int label, bool verbose) {
+  string file_idx = to_string(label);
+  if (file_idx.length() == 1) {
+    file_idx = "0" + file_idx;
+  }
+  string file_name = "../data-set/" + file_idx + "info.txt";
+  ifstream input_stream(file_name);
+  if (!input_stream) cerr << "Can't open input file:" << file_name << endl;
+  int first, second;
+  input_stream >> first;
+  input_stream >> second;
+  float *test_input = new float[10201];
+  float *test_output = new float[52];
+  for (int j = 0; j < 10201; j++) {
+    input_stream >> test_input[j];
+  }
+  mlp.predict(test_input, test_output, 1);
+  int label_output = find_max_idx(52, test_output) + 1;
+  if (verbose) {
+    cout << "Target Label:" << label << ", Predicted Label:" << label_output << endl;
+  }
+
+  delete[] test_input;
+  delete[] test_output;
+
+  return label_output == label;
+}
+
+int find_max_idx(int n, float* array) {
+  float max = FLT_MIN;
+  int idx = 0;
+  for (int i = 0; i < n; i++) {
+    if (array[i] > max) {
+      idx = i;
+      max = array[i];
+    }
+  }
+  return idx;
+}
 
 int main(int argc, char* argv[]) {
-    // Scan tests
-
-    printf("\n");
-    printf("****************\n");
-    printf("** SCAN TESTS **\n");
-    printf("****************\n");
-
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    // initialize b using StreamCompaction::CPU::scan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
-    // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
-    printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-	/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-	onesArray(SIZE, c);
-	printDesc("1s array for finding bugs");
-	StreamCompaction::Naive::scan(SIZE, c, a);
-	printArray(SIZE, c, true); */
-
-    zeroArray(SIZE, c);
-    printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
-
-    printf("\n");
-    printf("*****************************\n");
-    printf("** STREAM COMPACTION TESTS **\n");
-    printf("*****************************\n");
-
-    // Compaction tests
-
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
-
-    int count, expectedCount, expectedNPOT;
-
-    // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
-    // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
-    printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
-    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
-
-    zeroArray(SIZE, c);
-    printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
-
-    system("pause"); // stop Win32 console from closing on exit
-	delete[] a;
-	delete[] b;
-	delete[] c;
+  train_XOR();
+  cout << endl;
+  train_image();
 }
diff --git a/Project2-Stream-Compaction/README.md b/Project2-Stream-Compaction/README.md
index 0e38ddb..9d3694f 100644
--- a/Project2-Stream-Compaction/README.md
+++ b/Project2-Stream-Compaction/README.md
@@ -3,12 +3,119 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zheyuan Xie
+* Tested on: Windows 10 Pro, i7-7700HQ @ 2.80GHz 2.80GHz, 16GB, GTX 1050 2GB (Dell XPS 15 9560)
 
-### (TODO: Your README)
+## Description
+Stream compaction is a common parallel primitive used to remove unwanted elements in sparse data. In this project, we want to removes zeros from a integer array. An exclusive scan that is used to compute scatter indices is the core algorithm in stream compaction. 
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The following scan method is implemented:
+ - CPU Scan
+ - Naive GPU Scan
+ - Work-Efficient GPU Scan
 
+We also compared the above implementations to that provided in thrust library: `thrust::exclusive_scan`.
+
+## Performance Analysis
+### Blocksize Optimization
+The graph below shows work efficient compact runtime with power of two elements (2^24) versus block size. The minimum runtime is achieved with a block size of 128. Therefore we pick 128 as the default block size in all the implementations.
+
+![](img/blocksize_opt.jpg)
+
+### Runtime Comparison
+The graphs below compare the runtime of different scan implementations.
+
+![](img/scan_perf_small.jpg)
+ - When the array size is small, CPU scan has the best performance. This is probably due to the CUDA kernel overhead. Executing a kernel on the GPU is a relatively expensive operation.
+
+![](img/scan_perf.jpg)
+ - The advantage of using GPU begins to dominate when the array size is large (> 2^15).
+ - Thrust performed significently better. The reason is probably it utilizes shared memory instead of the much slower global memory.
+ - Work efficient scan performs worse than naive scan, which is unexpected. This will be discussed later. 
+ - Another unexplained phenomenon is CPU scan with non-power-of-two array size has significantly better performance (even better than GPU) than power-of-two array size. It could be a bug in the test program.
+
+### Extra Credit: Why is My GPU Approach So Slow?
+The reason why work efficient scan is slower than expected is although the number of arithmetic operation is reduced, the kernel call is launching too much unnecessary threads. This effect has more influence on runtime when the array size is large.
+
+To resolve this, both the kernel code and calling code is altered so that only necessary number of threads are launched. The thread index is converted to the array index by multiplying stride length.
+
+Before optimization:
+ - number of threads = 2^dmax
+ - In kernel:
+```
+if (k < n && k % (1 << (d+1)) == 0) {
+    array_idx = k;
+    ...
+}
+```
+
+After optimization:
+ - number of threads = 2^(dmax - 1 - d)
+ - In kernel:
+```
+if (k < n) {
+    array_idx = k * (1 << (d+1));
+    ...
+}
+```
+
+After optimization, work efficient is significantly improved, outperforming naive scan and cpu scan.
+![](img/work_efficient_optimized.jpg)
+
+### Test Output
+ - In `common.h`, BLOCKSIZE = 128.
+ - In `main.cpp`, SIZE = 2 << 15 (2^15).
+
+```
+****************
+** SCAN TESTS **
+****************
+    [   4   1  35  47  28  20  17  24  37  49  31  26  43 ...  33   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.058ms    (std::chrono Measured)
+    [   0   4   5  40  87 115 135 152 176 213 262 293 319 ... 802776 802809 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.0582ms    (std::chrono Measured)
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.059136ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.057184ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.156736ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.151712ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.06144ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.05952ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   3   1   1   2   0   1   0   1   1   1   2   1 ...   1   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.1095ms    (std::chrono Measured)
+    [   2   3   1   1   2   1   1   1   1   2   1   1   2 ...   2   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.1079ms    (std::chrono Measured)
+    [   2   3   1   1   2   1   1   1   1   2   1   1   2 ...   1   1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.2213ms    (std::chrono Measured)
+    [   2   3   1   1   2   1   1   1   1   2   1   1   2 ...   2   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.213024ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.175968ms    (CUDA Measured)
+    passed
+```
diff --git a/Project2-Stream-Compaction/img/blocksize_opt.jpg b/Project2-Stream-Compaction/img/blocksize_opt.jpg
new file mode 100644
index 0000000..2a4f57f
Binary files /dev/null and b/Project2-Stream-Compaction/img/blocksize_opt.jpg differ
diff --git a/Project2-Stream-Compaction/img/scan_perf.jpg b/Project2-Stream-Compaction/img/scan_perf.jpg
new file mode 100644
index 0000000..99fb252
Binary files /dev/null and b/Project2-Stream-Compaction/img/scan_perf.jpg differ
diff --git a/Project2-Stream-Compaction/img/scan_perf_small.jpg b/Project2-Stream-Compaction/img/scan_perf_small.jpg
new file mode 100644
index 0000000..7359584
Binary files /dev/null and b/Project2-Stream-Compaction/img/scan_perf_small.jpg differ
diff --git a/Project2-Stream-Compaction/img/work_efficient_optimized.jpg b/Project2-Stream-Compaction/img/work_efficient_optimized.jpg
new file mode 100644
index 0000000..15bf2f2
Binary files /dev/null and b/Project2-Stream-Compaction/img/work_efficient_optimized.jpg differ
diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp
index d016553..507ebee 100644
--- a/Project2-Stream-Compaction/src/main.cpp
+++ b/Project2-Stream-Compaction/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 16; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -44,7 +44,7 @@ int main(int argc, char* argv[]) {
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
+    //printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
index cdbef77..4bb0dc2 100644
--- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
+++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu
index 2ed6d63..3e0a828 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.cu
+++ b/Project2-Stream-Compaction/stream_compaction/common.cu
@@ -23,7 +23,11 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+          // TODO
+          int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+          if (idx < n) {
+            bools[idx] = (idata[idx] == 0) ? 0 : 1;
+          }
         }
 
         /**
@@ -31,8 +35,12 @@ namespace StreamCompaction {
          * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
          */
         __global__ void kernScatter(int n, int *odata,
-                const int *idata, const int *bools, const int *indices) {
-            // TODO
+          const int *idata, const int *bools, const int *indices) {
+          // TODO
+          int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+          if (idx < n && bools[idx]) {
+            odata[indices[idx]] = idata[idx];
+          }
         }
 
     }
diff --git a/Project2-Stream-Compaction/stream_compaction/common.h b/Project2-Stream-Compaction/stream_compaction/common.h
index 996997e..95d30a2 100644
--- a/Project2-Stream-Compaction/stream_compaction/common.h
+++ b/Project2-Stream-Compaction/stream_compaction/common.h
@@ -12,6 +12,7 @@
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define BLOCKSIZE 128
 
 /**
  * Check for CUDA errors; print and exit if there was a problem.
diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu
index a2d3e6c..301c03c 100644
--- a/Project2-Stream-Compaction/stream_compaction/cpu.cu
+++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu
@@ -12,15 +12,25 @@ namespace StreamCompaction {
 	        return timer;
         }
 
+        void scan_implementation(int n, int *odata, const int *idata) {
+          odata[0] = 0;
+          for (int i = 1; i < n; i++) {
+            odata[i] = odata[i - 1] + idata[i - 1];
+          }
+        }
+
         /**
          * CPU scan (prefix sum).
          * For performance analysis, this is supposed to be a simple for loop.
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
-        void scan(int n, int *odata, const int *idata) {
-	        timer().startCpuTimer();
-            // TODO
-	        timer().endCpuTimer();
+        void scan(int n, int *odata, const int *idata) {	        
+          // TODO
+          timer().startCpuTimer();
+
+          scan_implementation(n, odata, idata);
+	        
+          timer().endCpuTimer();
         }
 
         /**
@@ -29,10 +39,20 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
-	        timer().startCpuTimer();
-            // TODO
+          // TODO
+
+          timer().startCpuTimer();
+
+          int cnt = 0;
+          for (int i = 0; i < n; i++) {
+            if (idata[i] != 0) {
+              odata[cnt++] = idata[i];
+            }
+          }
+
 	        timer().endCpuTimer();
-            return -1;
+
+          return cnt;
         }
 
         /**
@@ -41,10 +61,36 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
-	        timer().startCpuTimer();
 	        // TODO
-	        timer().endCpuTimer();
-            return -1;
+          // allocate memory
+          int* isnonzero = (int*)malloc(n * sizeof(int));
+          int* indices = (int*)malloc(n * sizeof(int));
+          
+          timer().startCpuTimer();
+            
+          // identify non zero elements
+          for (int i = 0; i < n; i++) {
+            isnonzero[i] = (idata[i] != 0) ? 1 : 0;
+          }
+
+          // compute indices with an exclusive scan
+          scan_implementation(n, indices, isnonzero);
+          int n_compact = isnonzero[n - 1] ? indices[n - 1] + 1: indices[n - 1];
+
+          // scatter
+          for (int i = 0; i < n; i++) {
+            if (isnonzero[i]) {
+              odata[indices[i]] = idata[i];
+            }
+          }
+
+          timer().endCpuTimer();
+
+          // free memory
+          free(isnonzero);
+          free(indices);
+ 
+          return n_compact;
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu
index 2db346e..b395d24 100644
--- a/Project2-Stream-Compaction/stream_compaction/efficient.cu
+++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu
@@ -12,13 +12,61 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpSweep(int n, int pow2d, int *idata) {
+          int k = threadIdx.x + (blockIdx.x * blockDim.x);
+          if (k >= n) {
+            return;
+          }
+          int pow2d2 = pow2d * 2;
+          idata[k * pow2d2 + pow2d2 - 1] += idata[k * pow2d2 + pow2d - 1];
+        }
+
+        __global__ void kernDownSweep(int n, int pow2d, int *idata) {
+          int k = threadIdx.x + (blockIdx.x * blockDim.x);
+          if (k >= n) {
+            return;
+          }
+          int pow2d2 = pow2d * 2;
+          int t = idata[k * pow2d2 + pow2d - 1];
+          idata[k * pow2d2 + pow2d - 1] = idata[k * pow2d2 + pow2d2 - 1];
+          idata[k * pow2d2 + pow2d2 - 1] += t;
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // TODO
+            int size = pow(2, ilog2ceil(n));
+            int memsize = size * sizeof(int);
+            int *dev_in;
+            cudaMalloc((void**)&dev_in, memsize);
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
+
+            dim3 blockSize(BLOCKSIZE);
+            dim3 gridSize((size + BLOCKSIZE - 1) / BLOCKSIZE);
+            int dmax = ilog2ceil(n);
+
+            for (int d = 0; d < dmax; d++) {
+              int n_threads = 1 << (dmax - 1 - d);
+              gridSize = dim3((n_threads + BLOCKSIZE - 1) / BLOCKSIZE);
+              kernUpSweep << <gridSize, blockSize >> > (n_threads, 1 << d, dev_in);
+            }
+
+            cudaMemset((void*)&(dev_in[size - 1]), 0, sizeof(int));
+
+            for (int d = dmax - 1; d >= 0; d--) {
+              int n_threads = 1 << (dmax - 1 - d);
+              gridSize = dim3((n_threads + BLOCKSIZE - 1) / BLOCKSIZE);
+              kernDownSweep << <gridSize, blockSize >> > (n_threads, 1 << d, dev_in);
+            }
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_in);
         }
 
         /**
@@ -31,10 +79,76 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // TODO
+            size_t size = n * sizeof(int);
+            int n2 = pow(2, ilog2ceil(n));
+            size_t size2 = n2 * sizeof(int);
+
+            // allocate device memory
+            int *dev_odata, *dev_idata, *dev_bools, *dev_indices;
+            cudaMalloc((void**)&dev_idata, size);
+            cudaMalloc((void**)&dev_odata, size);
+            cudaMalloc((void**)&dev_bools, size);
+            cudaMalloc((void**)&dev_indices, size2);
+
+            // allocate host memory
+            int *bools, *indices;
+            bools = (int*)malloc(size);
+            indices = (int*)malloc(size);
+            
+            // calculate block size and grid size
+            int gridSize = (n + BLOCKSIZE - 1) / BLOCKSIZE;
+            int gridSize2 = (n2 + BLOCKSIZE - 1) / BLOCKSIZE;
+            
+            // copy memory to device
+            cudaMemcpy(dev_idata, idata, size, cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
+            
+            // identify non-zero elements
+            Common::kernMapToBoolean << <gridSize, BLOCKSIZE >> > (n, dev_bools, dev_idata);
+
+            // exclusive scan
+            cudaMemcpy(dev_indices, dev_bools, size, cudaMemcpyDeviceToDevice);
+
+            int dmax = ilog2ceil(n2);
+            int n_threads;
+            for (int d = 0; d < dmax; d++) {
+              int n_threads = 1 << (dmax - 1 - d);
+              gridSize2 = (n_threads + BLOCKSIZE - 1) / BLOCKSIZE;
+              kernUpSweep << <gridSize2, BLOCKSIZE >> > (n_threads, 1 << d, dev_indices);
+            }
+
+            cudaMemset((void*)&(dev_indices[n2 - 1]), 0, sizeof(int));
+
+            for (int d = dmax - 1; d >= 0; d--) {
+              int n_threads = 1 << (dmax - 1 - d);
+              gridSize2 = (n_threads + BLOCKSIZE - 1) / BLOCKSIZE;
+              kernDownSweep << <gridSize2, BLOCKSIZE >> > (n_threads, 1 << d, dev_indices);
+            }
+            
+            // stream compaction
+            Common::kernScatter << <gridSize, BLOCKSIZE >> > (n, dev_odata, dev_idata, dev_bools, dev_indices);
+
             timer().endGpuTimer();
-            return -1;
+
+            // copy memory to host
+            cudaMemcpy(indices, dev_indices, size, cudaMemcpyDeviceToHost);
+            cudaMemcpy(bools, dev_bools, size, cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, dev_odata, size, cudaMemcpyDeviceToHost);
+            int n_compact = bools[n - 1] ? indices[n - 1] + 1 : indices[n - 1];
+             
+            // free device memory
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+
+            // free host memory
+            free(bools);
+            free(indices);
+
+            return n_compact;
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu
index 4308876..09e87e6 100644
--- a/Project2-Stream-Compaction/stream_compaction/naive.cu
+++ b/Project2-Stream-Compaction/stream_compaction/naive.cu
@@ -11,15 +11,52 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+
         // TODO: __global__
+        __global__ void kernNaiveScan(int N, int lowbound, int *odata, int *idata) {
+          int thid = threadIdx.x + (blockIdx.x * blockDim.x);
+          if (thid >= lowbound) {
+            odata[thid] = idata[thid] + idata[thid - lowbound];
+          }
+          else {
+            odata[thid] = idata[thid];
+          }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // TODO
+            // Declare two device buffer
+            int *dev_data[2];
+              
+            // Allocate device memory
+            cudaMalloc((void**)&dev_data[0], n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_idata failed!");
+
+            cudaMalloc((void**)&dev_data[1], n * sizeof(int));
+            checkCUDAErrorFn("cudaMalloc dev_odata failed!");
+
+            // Copy data to the first device buffer
+            cudaMemcpy(dev_data[0], idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("cudaMemcpy (host to device) failed!");
+            
+            timer().startGpuTimer();
+            dim3 fullBlocksPerGrid((n + BLOCKSIZE - 1) / BLOCKSIZE);
+            for (int d = 1; d <= ilog2ceil(n); d++) {
+              kernNaiveScan << <fullBlocksPerGrid, dim3(BLOCKSIZE) >> > (n, 1 << (d - 1), dev_data[d % 2], dev_data[(d + 1) % 2]);
+            }
             timer().endGpuTimer();
+
+            // Convert inclusive scan to exclusive scan, shift right and insert identity.
+            cudaMemcpy(odata + 1, dev_data[ilog2ceil(n) % 2], (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("cudaMemcpy (device to host) failed!");
+            odata[0] = 0;
+            
+            // Free device memory
+            cudaFree(dev_data[0]);
+            cudaFree(dev_data[1]);
         }
     }
 }
diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu
index 1def45e..e9116c5 100644
--- a/Project2-Stream-Compaction/stream_compaction/thrust.cu
+++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu
@@ -18,11 +18,18 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::host_vector<int> host_in(idata, idata + n);
+            thrust::device_vector<int> device_in = host_in;
+            thrust::device_vector<int> device_out(n);
+
+            timer().startGpuTimer();
+            thrust::exclusive_scan(device_in.begin(), device_in.end(), device_out.begin());
             timer().endGpuTimer();
+
+            thrust::copy(device_out.begin(), device_out.end(), odata);
         }
     }
 }
diff --git a/README.md b/README.md
index 3a0b2fe..1d87221 100644
--- a/README.md
+++ b/README.md
@@ -3,14 +3,9 @@ CUDA Number Algorithms
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Link to the readmes of the other two subprojects.
-
-Add anything else you think is relevant up to this point.
-(Remember, this is public, so don't put anything here that you don't want to share with the world.)
+* Zheyuan Xie
+* Tested on: Windows 10 Pro, i7-7700HQ @ 2.80GHz 2.80GHz, 16GB, GTX 1050 2GB (Dell XPS 15 9560)
 
+## Links to Subprojects
+ - [Stream Compaction](./Project2-Stream-Compaction/README.md)
+ - [Character Recognition](./Project2-Character-Recognition/README.md)