diff --git a/Neural Net Language Model/FProp.R b/Neural Net Language Model/FProp.R index 31d145e..7186a24 100644 --- a/Neural Net Language Model/FProp.R +++ b/Neural Net Language Model/FProp.R @@ -1,110 +1,123 @@ +# Original FProp file, with minor modifications to accomodate R + # library(Matrix) #library(matrixStats) +library(R.matlab) +library(matrixStats) -fprop <- function(input_batch, weights, fn) { -# % This method forward propagates through a neural network. -# % Inputs: -# % input_batch: The input data as a matrix of size numwords X batchsize where, -# % numwords is the number of words, batchsize is the number of data points. -# % So, if input_batch(i, j) = k then the ith word in data point j is word -# % index k of the vocabulary. -# % -# % word_embedding_weights: Word embedding as a matrix of size -# % vocab_size X numhid1, where vocab_size is the size of the vocabulary -# % numhid1 is the dimensionality of the embedding space. -# % -# % embed_to_hid_weights: Weights between the word embedding layer and hidden -# % layer as a matrix of size numhid1*numwords X numhid2, numhid2 is the -# % number of hidden units. -# % -# % hid_to_output_weights: Weights between the hidden layer and output softmax -# % unit as a matrix of size numhid2 X vocab_size -# % -# % hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1. -# % -# % output_bias: Bias of the output layer as a matrix of size vocab_size X 1. -# % -# % Outputs: -# % embedding_layer_state: State of units in the embedding layer as a matrix of -# % size numhid1*numwords X batchsize -# % -# % hidden_layer_state: State of units in the hidden layer as a matrix of size -# % numhid2 X batchsize -# % -# % output_layer_state: State of units in the output layer as a matrix of size -# % vocab_size X batchsize -# % - - tmp <- dim(input_batch) # basically dim +fprop <- function(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) { + # % This method forward propagates through a neural network. + # % Inputs: + # % input_batch: The input data as a matrix of size numwords X batchsize where, + # % numwords is the number of words, batchsize is the number of data points. + # % So, if input_batch(i, j) = k then the ith word in data point j is word + # % index k of the vocabulary. + # % + # % word_embedding_weights: Word embedding as a matrix of size + # % vocab_size X numhid1, where vocab_size is the size of the vocabulary + # % numhid1 is the dimensionality of the embedding space. + # % + # % embed_to_hid_weights: Weights between the word embedding layer and hidden + # % layer as a matrix of size numhid1*numwords X numhid2, numhid2 is the + # % number of hidden units. + # % + # % hid_to_output_weights: Weights between the hidden layer and output softmax + # % unit as a matrix of size numhid2 X vocab_size + # % + # % hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1. + # % + # % output_bias: Bias of the output layer as a matrix of size vocab_size X 1. 
+ # % + # % Outputs: + # % embedding_layer_state: State of units in the embedding layer as a matrix of + # % size numhid1*numwords X batchsize + # % + # % hidden_layer_state: State of units in the hidden layer as a matrix of size + # % numhid2 X batchsize + # % + # % output_layer_state: State of units in the output layer as a matrix of size + # % vocab_size X batchsize + # % + + tmp <- size(input_batch) # basically dim numwords <- tmp[1] batchsize <- tmp[2] - tmp <- dim(weights$word_embedding) # basically dim + tmp <- size(word_embedding_weights) # basically dim vocab_size <- tmp[1] numhid1 <- tmp[2] + numhid2 <- size(embed_to_hid_weights, 2) - numhid2 <- ncol(weights$embed_to_hid) - -# %% COMPUTE STATE OF WORD EMBEDDING LAYER. -# % Look up the inputs word indices in the word_embedding_weights matrix. + # %% COMPUTE STATE OF WORD EMBEDDING LAYER. + # % Look up the inputs word indices in the word_embedding_weights matrix. # each row of the word weights corresponds to a word (250 total) # input_batch contains 300 total words (3 * 100 batchsize) # each element of input_batch is a number between 1 and 249 (250?), corresponding to a word #embedding_layer_state2 <- matrix(as.numeric(weights$word_embedding[as.integer(input_batch), ]), nrow=numhid1 * numwords) - embedding_layer_state <- myReshape(weights$word_embedding[as.integer(input_batch),], nrows=numhid1 * numwords) + #embedding_layer_state <- myReshape(weights$word_embedding[as.integer(input_batch),], nrows=numhid1 * numwords) + #embedding_layer_state <- reshape(t(word_embedding_weights[reshape(input_batch, 1, []), ]), numhid1 * numwords, []) + + # [] is allowed in reshape: one dimension remains unspecified and Octave will determine it automatically + tmp <- t(word_embedding_weights[reshape(input_batch, 1, length(input_batch)), ]) + embedding_layer_state <- reshape(tmp, numhid1 * numwords, length(tmp) / (numhid1 * numwords)) #embedding_layer_state2 <- matlab::reshape(as.matrix(weights$word_embedding[as.numeric(input_batch), ]), numhid1*numwords, 100) - -# %% COMPUTE STATE OF HIDDEN LAYER. -# % Compute inputs to hidden units. + + # %% COMPUTE STATE OF HIDDEN LAYER. + # % Compute inputs to hidden units. # crossprod = t(x) %*% y - inputs_to_hidden_units = myCrossProd(weights$embed_to_hid, embedding_layer_state) + fn(weights$hid_bias, 1, batchsize) - -# benchmark( -# tmp1 <- repmat(weights$hid_bias, 1, batchsize), -# tmp2 <- myRepMat4(weights$hid_bias, 1, batchsize), -# replications=10 -# ) - - -# % Apply logistic activation function. -# % FILL IN CODE. Replace the line below by one of the options. -# hidden_layer_state = zeros(numhid2, batchsize) -# % Options -# % (a) hidden_layer_state = 1 ./ (1 + exp(inputs_to_hidden_units)); -# % (b) hidden_layer_state = 1 ./ (1 - exp(-inputs_to_hidden_units)); - hidden_layer_state = 1 / (1 + exp(-inputs_to_hidden_units)) -# % (d) hidden_layer_state = -1 ./ (1 + exp(-inputs_to_hidden_units)); - -# %% COMPUTE STATE OF OUTPUT LAYER. -# % Compute inputs to softmax. -# % FILL IN CODE. Replace the line below by one of the options. 
-# inputs_to_softmax = zeros(vocab_size, batchsize) -# % Options - inputs_to_softmax = myCrossProd(weights$hid_to_output, hidden_layer_state) + fn(weights$output_bias, 1, batchsize) - -# % (b) inputs_to_softmax = t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, batchsize, 1); -# % (c) inputs_to_softmax = hidden_layer_state %*% t(hid_to_output_weights) + repmat(output_bias, 1, batchsize); -# % (d) inputs_to_softmax = hid_to_output_weights %*% hidden_layer_state + repmat(output_bias, batchsize, 1); - -# % Subtract maximum. -# % Remember that adding or subtracting the same constant from each input to a -# % softmax unit does not affect the outputs. Here we are subtracting maximum to -# % make all inputs <= 0. This prevents overflows when computing their -# % exponents. + #inputs_to_hidden_units = myCrossProd(weights$embed_to_hid, embedding_layer_state) + fn(weights$hid_bias, 1, batchsize) + inputs_to_hidden_units <- t(embed_to_hid_weights) %*% embedding_layer_state + repmat(hid_bias, 1, batchsize) + + + # benchmark( + # tmp1 <- repmat(weights$hid_bias, 1, batchsize), + # tmp2 <- myRepMat4(weights$hid_bias, 1, batchsize), + # replications=10 + # ) + + + # % Apply logistic activation function. + # % FILL IN CODE. Replace the line below by one of the options. + # hidden_layer_state = zeros(numhid2, batchsize) + # % Options + # % (a) hidden_layer_state = 1 ./ (1 + exp(inputs_to_hidden_units)); + # % (b) hidden_layer_state = 1 ./ (1 - exp(-inputs_to_hidden_units)); + hidden_layer_state = 1 / (1 + exp(-inputs_to_hidden_units)) + # % (d) hidden_layer_state = -1 ./ (1 + exp(-inputs_to_hidden_units)); + + # %% COMPUTE STATE OF OUTPUT LAYER. + # % Compute inputs to softmax. + # % FILL IN CODE. Replace the line below by one of the options. + # inputs_to_softmax = zeros(vocab_size, batchsize) + # % Options + #inputs_to_softmax = myCrossProd(weights$hid_to_output, hidden_layer_state) + fn(weights$output_bias, 1, batchsize) + inputs_to_softmax <- t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, 1, batchsize) + + # % (b) inputs_to_softmax = t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, batchsize, 1); + # % (c) inputs_to_softmax = hidden_layer_state %*% t(hid_to_output_weights) + repmat(output_bias, 1, batchsize); + # % (d) inputs_to_softmax = hid_to_output_weights %*% hidden_layer_state + repmat(output_bias, batchsize, 1); + + # % Subtract maximum. + # % Remember that adding or subtracting the same constant from each input to a + # % softmax unit does not affect the outputs. Here we are subtracting maximum to + # % make all inputs <= 0. This prevents overflows when computing their + # % exponents. # max in matlab returns max from each column by default -# benchmark( -# tmp <- apply(inputs_to_softmax, 2, max), -# tmp2 <- colMaxs(inputs_to_softmax), -# replications=10) - tmp <- apply(inputs_to_softmax, 2, max) - inputs_to_softmax = inputs_to_softmax - fn(tmp, vocab_size, 1) - -# % Compute exp. + # benchmark( + # tmp <- apply(inputs_to_softmax, 2, max), + # tmp2 <- colMaxs(inputs_to_softmax), + # replications=10) + #tmp <- apply(inputs_to_softmax, 2, max) + #inputs_to_softmax = inputs_to_softmax - fn(tmp, vocab_size, 1) + inputs_to_softmax <- inputs_to_softmax - repmat(colMaxs(inputs_to_softmax), vocab_size, 1) + + # % Compute exp. output_layer_state = exp(inputs_to_softmax) -# % Normalize to get probability distribution. 
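The max subtraction, exponentiation, and normalization around this point form the usual numerically stable column-wise softmax. A minimal standalone sketch in base R, assuming x is a vocab_size x batchsize matrix of softmax inputs and using sweep() in place of repmat():

    softmax_columns <- function(x) {
      x <- sweep(x, 2, apply(x, 2, max), "-")  # shift each column so its maximum is 0; exp() cannot overflow
      e <- exp(x)
      sweep(e, 2, colSums(e), "/")             # divide each column by its sum to get a probability vector
    }

Subtracting a per-column constant leaves the result unchanged because the softmax of a column is invariant to adding the same constant to every input in that column.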
- output_layer_state = output_layer_state / fn(colSums(output_layer_state), vocab_size, 1) + # % Normalize to get probability distribution. + #output_layer_state = output_layer_state / fn(colSums(output_layer_state), vocab_size, 1) + output_layer_state <- output_layer_state / repmat(matlab::sum(output_layer_state), vocab_size, 1) return(list(embedding_layer_state=embedding_layer_state, hidden_layer_state=hidden_layer_state, diff --git a/Neural Net Language Model/FProp_Original.R b/Neural Net Language Model/FProp_Original.R index 7186a24..794c8d5 100644 --- a/Neural Net Language Model/FProp_Original.R +++ b/Neural Net Language Model/FProp_Original.R @@ -5,7 +5,7 @@ library(R.matlab) library(matrixStats) -fprop <- function(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) { +fprop_original <- function(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) { # % This method forward propagates through a neural network. # % Inputs: # % input_batch: The input data as a matrix of size numwords X batchsize where, # % numwords is the number of words, batchsize is the number of data points. # % So, if input_batch(i, j) = k then the ith word in data point j is word # % index k of the vocabulary. diff --git a/Neural Net Language Model/LoadData.R b/Neural Net Language Model/LoadData.R index 716e7c4..52482de 100644 --- a/Neural Net Language Model/LoadData.R +++ b/Neural Net Language Model/LoadData.R @@ -1,6 +1,9 @@ +# Load Data Original, with minor changes to accommodate R + library(R.matlab) # library(Matrix) library(parallel); options(mc.cores = 4) +library(itertools) load_data <- function(N) { # % This method loads the training, validation and test set. # % It also divides the training set into mini-batches. # % Inputs: @@ -24,32 +27,18 @@ load_data <- function(N) { vocab = unlist(data.mat$data[4,1,1]) ) - numdims = nrow(data$trainData) + numdims = size(data$trainData, 1) D = numdims - 1 # subtract 1 because 1:D is the number of input words and D is the predicted word - M = floor(ncol(data$trainData) / N) + M = floor(size(data$trainData, 2) / N) # shift to a list of M minibatches, each with D*N # looks like we threw out the remainder training data - splitMatrixIntoBatch <- function(dat, b, N, byCol=TRUE) { - # N is the size of each batch - # b is the requested batch - if(length(dim(dat)) == 0) { - if(byCol) dim(dat) <- c(1, length(dat)) else dim(dat) <- c(length(dat), 1) - } - start <- ((b - 1) * N) + 1 - end <- b * N - - if(byCol) return(dat[,start:end]) else return(dat[start:end,]) - } - train_input <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[1:D,], byCol=TRUE) - train_target <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[D+1,], byCol=TRUE) - - valid_input <- (data$validData[1:D,]) - valid_target <- data$validData[D + 1,] - - test_input <- (data$testData[1:D,]) - test_target <- data$testData[D + 1,] - + train_input <- data$trainData[1:D, 1:(N*M), drop=F] + train_target <- data$trainData[D + 1, 1:(N*M), drop=F] + valid_input <- data$validData[1:D,, drop=F] + valid_target <- data$validData[D + 1, , drop=F] + test_input <- data$testData[1:D, , drop=F] + test_target <- data$testData[D + 1, , drop=F] vocab <- data$vocab return(list(train_input=train_input, @@ -59,4 +48,70 @@ load_data <- function(N) { test_input=test_input, test_target=test_target, vocab=vocab)) -} \ No newline at end of file +} + +# faster to access lists than indices in an array +# but slower to create a set of lists than to reshape the training data into an array + + + +# train_input3 <- vector(M, "list") +# fn <- function() { +# it <- isplitCols(data$trainData[1:D, 1:(N*M)], chunks = M) +# replicate(M, nextElem(it), simplify=F) +# } +# +# 
fn2 <- function() { +# enumerate(isplitCols(data$trainData[1:D, 1:(N*M)], chunks = M)) +# } +# +# myReshape <- function(A, ...) { +# if (!is.array(A)) { +# stop(sprintf("argument %s must be matrix or array", sQuote("A"))) +# } +# nargs <- length(dots <- list(...)) +# dims <- as.integer(if (nargs == 1 && matlab:::is.size_t(dots[[1]])) { +# dots[[1]] +# } else { +# unlist(dots) +# }) +# if (!(length(dims) > 1)) { +# stop("dimensions must be of length greater than 1") +# } +# else if (!(all(dims > 0))) { +# stop("dimensions must be a positive quantity") +# } +# else if (prod(dims) != prod(dim(A))) { +# stop("number of elements must not change") +# } +# dim(A) <- dims +# +# return(A) +# +# } +# +# +# +# +# benchmark( +# train_input1 <- reshape(data$trainData[1:D, 1:(N*M), drop=F], D, N, M), +# train_input2 <- lapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[1:D,], byCol=TRUE), +# train_input3 <- fn(), +# train_input4 <- myReshape(data$trainData[1:D, 1:(N*M), drop=F], D, N, M), # close second +# train_input5 <- fn2(), # fastest +# replications=10 +# ) +# # +# +# benchmark( +# data1 <- load_data_original(batchsize), +# data2 <- load_data(batchsize), +# +# replications <- 10 +# ) +# +# benchmark( +# for(m in 1:372) tmp1 <- data1$train_input[,,m], +# for(m in 1:372) tmp2 <- data2$train_input[[m]], +# replications <- 2 +# ) diff --git a/Neural Net Language Model/LoadData_Original.R b/Neural Net Language Model/LoadData_Original.R index f732903..4eee335 100644 --- a/Neural Net Language Model/LoadData_Original.R +++ b/Neural Net Language Model/LoadData_Original.R @@ -4,7 +4,7 @@ library(R.matlab) # library(Matrix) #library(parallel); options(mc.cores = 4) -load_data <- function(N) { +load_data_original <- function(N) { # % This method loads the training, validation and test set. # % It also divides the training set into mini-batches. # % Inputs: diff --git a/Neural Net Language Model/Train.R b/Neural Net Language Model/Train.R index 1fb2d29..7d3f52a 100644 --- a/Neural Net Language Model/Train.R +++ b/Neural Net Language Model/Train.R @@ -1,10 +1,12 @@ +# Original training file, with minor modifications to run in R + # setwd("~/R_Projects/Neural Networks for Machine Learning") -library(gputools) -library(Matrix) -library(parallel); options(mc.cores=4) -library(R.utils) +#library(gputools) +#library(Matrix) +#library(parallel); options(mc.cores=4) +# library(R.utils) -source("MatlabFunctions.R") +source("Neural Net Language Model/MatlabFunctions.R") source("Neural Net Language Model/LoadData.R") source("Neural Net Language Model/FProp.R") @@ -13,14 +15,14 @@ source("Neural Net Language Model/FProp.R") # % This function trains a neural network language model. train <- function(epochs) { -# % Inputs: -# % epochs: Number of epochs to run. -# % Output: -# % model: A struct containing the learned weights and biases and vocabulary. + # % Inputs: + # % epochs: Number of epochs to run. + # % Output: + # % model: A struct containing the learned weights and biases and vocabulary. start_time <- proc.time() -# % SET HYPERPARAMETERS HERE. + # % SET HYPERPARAMETERS HERE. batchsize = 1000 # Mini-batch size; default = 100. learning_rate = 0.1 # Learning rate; default = 0.1. momentum = 0.9 # Momentum; default = 0.9. @@ -28,89 +30,76 @@ train <- function(epochs) { numhid2 = 200 # Number of units in hidden layer; default = 200. init_wt = 0.01 # Standard deviation of the normal distribution, which is sampled to get the initial weights; default = 0.01 -# % VARIABLES FOR TRACKING TRAINING PROGRESS. 
+ # % VARIABLES FOR TRACKING TRAINING PROGRESS. # number is the number of batches to run before showing training & validation show_training_CE_after = 100 - show_validation_CE_after = 100 + show_validation_CE_after = 1000 -# % LOAD DATA. + # % LOAD DATA. data <- load_data(batchsize) - tmp <- data$train_input[[1]] - numwords <- nrow(tmp) - batchsize <- ncol(tmp) - numbatches <- length(data$train_input) - vocab_size <- length(data$vocab) - -# % INITIALIZE WEIGHTS AND BIASES. - d1 <- list(word_embedding=vocab_size, - embed_to_hid=numwords * numhid1, - hid_to_output=numhid2, - hid_bias=numhid2, - output_bias=vocab_size) - d2 <- list(word_embedding=numhid1, - embed_to_hid=numhid2, - hid_to_output=vocab_size, - hid_bias=1, - output_bias=1) + tmp <- size(data$train_input) + numwords <- tmp[1] + #batchsize <- tmp[2] + #numbatches <- tmp[3] + vocab_size <- size(data$vocab, 2) - - weights <- c(mapply(function(x, y, init_wt) init_wt*randn(x,y), x=d1[1:3], y=d2[1:3], MoreArgs=list(init_wt=init_wt)), - mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0), i=0)) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) - - deltas <- c(mapply(zeros, d1[1:3], d2[1:3]), - mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0, i=0))) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) + word_embedding_weights = init_wt * randn(vocab_size, numhid1); + embed_to_hid_weights = init_wt * randn(numwords * numhid1, numhid2); + hid_to_output_weights = init_wt * randn(numhid2, vocab_size); + hid_bias = zeros(numhid2, 1); + output_bias = zeros(vocab_size, 1); - gradients <- list(word_embedding=zeros(d1[1], d2[1]), - embed_to_hid=NULL, - hid_to_output=NULL, - hid_bias=NULL, - output_bias=NULL) - - expansion_matrix = .symDiagonal(vocab_size) # expanding the matrix is faster on a Diagonal class; .symDiagonal faster than just Diagonal here (b/c we are indexing it) - #expansion_matrix = eye(vocab_size) + word_embedding_weights_delta = zeros(vocab_size, numhid1); + word_embedding_weights_gradient = zeros(vocab_size, numhid1); + embed_to_hid_weights_delta = zeros(numwords * numhid1, numhid2); + hid_to_output_weights_delta = zeros(numhid2, vocab_size); + hid_bias_delta = zeros(numhid2, 1); + output_bias_delta = zeros(vocab_size, 1); + expansion_matrix = eye(vocab_size); count = 0 tiny = exp(-30) - - datasetsize = ncol(data$valid_input) - #datasetsize = - expanded_valid_target = expansion_matrix[, as.integer(data$valid_target)] - - - - -# % TRAIN. + + # % TRAIN. for(epoch in 1:epochs) { myPrintf('Epoch %d\n', epoch) this_chunk_CE <- 0 trainset_CE <- 0 -# % LOOP OVER MINI-BATCHES. - for(m in 1:numbatches) { - - input_batch <- data$train_input[[m]] - target_batch <- data$train_target[[m]] -# % FORWARD PROPAGATE. -# % Compute the state of each layer in the network given the input batch and all weights and biases + inputIT <- ihasNext(isplitCols(data$train_input, chunkSize=batchsize)) + targetIT <- ihasNext(isplitCols(data$train_target, chunkSize=batchsize)) + m <- 0 + # % LOOP OVER MINI-BATCHES. + #for(m in 1:numbatches) { + while(hasNext(inputIT) & hasNext(targetIT)) { + m <- m + 1 + + #input_batch <- data$train_input[,,m] + #target_batch <- data$train_target[,,m] + input_batch <- nextElem(inputIT) + target_batch <- nextElem(targetIT) + + # % FORWARD PROPAGATE. 
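The rewritten loop above walks the training matrix with column-splitting iterators instead of indexing a pre-built list of minibatches. A small self-contained sketch of the same isplitCols/ihasNext pattern on toy data (the matrices and chunk size below are made up purely for illustration):

    library(itertools)  # also attaches iterators, which supplies nextElem()
    toy_input  <- matrix(seq_len(30), nrow = 3)    # 3 input words x 10 training cases
    toy_target <- matrix(seq_len(10), nrow = 1)    # 1 target word per case
    inputIT  <- ihasNext(isplitCols(toy_input,  chunkSize = 4))
    targetIT <- ihasNext(isplitCols(toy_target, chunkSize = 4))
    while (hasNext(inputIT) & hasNext(targetIT)) {
      input_batch  <- nextElem(inputIT)            # 3 x 4 matrix; the final chunk is 3 x 2
      target_batch <- nextElem(targetIT)
      cat("minibatch with", ncol(input_batch), "cases\n")
    }

Splitting by chunkSize = batchsize, as the training loop does, removes the need for a precomputed numbatches count; the loop simply ends when hasNext() returns FALSE.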
+ # % Compute the state of each layer in the network given the input batch and all weights and biases # returns the embedding, hidden and output layer states - neural_net_states <- fprop(input_batch, weights, repmat) - -# benchmark( -# neural_net_states <- fprop(input_batch, weights, repmat), -# neural_net_states2 <- fprop(input_batch, weights, myRepMat), -# neural_net_states3 <- fprop(input_batch, weights, myRepMat2), -# neural_net_states3 <- fprop(input_batch, weights, myRepMat3), -# neural_net_states4 <- fprop(input_batch, weights, myRepMat4), -# replications=10 -# ) - -# % COMPUTE DERIVATIVE. -# %% Expand the target to a sparse 1-of-K vector. + neural_net_states <- fprop(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) + + # benchmark( + # neural_net_states <- fprop(input_batch, weights, repmat), + # neural_net_states2 <- fprop(input_batch, weights, myRepMat), + # neural_net_states3 <- fprop(input_batch, weights, myRepMat2), + # neural_net_states3 <- fprop(input_batch, weights, myRepMat3), + # neural_net_states4 <- fprop(input_batch, weights, myRepMat4), + # replications=10 + # ) + + # % COMPUTE DERIVATIVE. + # %% Expand the target to a sparse 1-of-K vector. expanded_target_batch = expansion_matrix[, as.integer(target_batch)] -# %% Compute derivative of cross-entropy loss function. + # %% Compute derivative of cross-entropy loss function. error_deriv = neural_net_states$output_layer_state - expanded_target_batch -# % MEASURE LOSS FUNCTION. - CE = -sum(colSums(expanded_target_batch * log(neural_net_states$output_layer_state + tiny))) / batchsize + # % MEASURE LOSS FUNCTION. + CE = -matlab::sum(matlab::sum(expanded_target_batch * log(neural_net_states$output_layer_state + tiny))) / batchsize count = count + 1 this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count trainset_CE = trainset_CE + (CE - trainset_CE) / m @@ -122,79 +111,97 @@ train <- function(epochs) { this_chunk_CE = 0 } -# % BACK PROPAGATE. -# %% OUTPUT LAYER. + # % BACK PROPAGATE. + # %% OUTPUT LAYER. # tcrossprod == x %*% t(y) - gradients$hid_to_output = myTCrossProd(neural_net_states$hidden_layer_state, error_deriv) - gradients$output_bias = rowSums(error_deriv) - back_propagated_deriv_1 = myMatMult(weights$hid_to_output,error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) - -# %% HIDDEN LAYER. -# % FILL IN CODE. Replace the line below by one of the options. -# gradients$embed_to_hid = zeros(numhid1 * numwords, numhid2); -# % Options: -# % (a) embed_to_hid_weights_gradient = t(back_propagated_deriv_1) %*% embedding_layer_state; - gradients$embed_to_hid = myTCrossProd(neural_net_states$embedding_layer_state, back_propagated_deriv_1); -# % (c) embed_to_hid_weights_gradient = back_propagated_deriv_1; -# % (d) embed_to_hid_weights_gradient = embedding_layer_state; - -# % FILL IN CODE. Replace the line below by one of the options. -# gradients$hid_bias = zeros(numhid2, 1); -# % Options - gradients$hid_bias = rowSums(back_propagated_deriv_1); -# % (b) hid_bias_gradient = apply(back_propagated_deriv_1, 2, sum); -# % (c) hid_bias_gradient = back_propagated_deriv_1; -# % (d) hid_bias_gradient = back_propagated_deriv_1'; - -# % FILL IN CODE. Replace the line below by one of the options. 
-# back_propagated_deriv_2 = zeros(numhid2, batchsize); -# % Options - back_propagated_deriv_2 = myMatMult(weights$embed_to_hid, back_propagated_deriv_1); -# % (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights; -# % (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights; -# % (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights'; + #gradients$hid_to_output = myTCrossProd(neural_net_states$hidden_layer_state, error_deriv) + #gradients$output_bias = rowSums(error_deriv) + #back_propagated_deriv_1 = myMatMult(weights$hid_to_output,error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) + hid_to_output_weights_gradient <- neural_net_states$hidden_layer_state %*% t(error_deriv) + output_bias_gradient <- rowSums(error_deriv) + back_propagated_deriv_1 <- (hid_to_output_weights %*% error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) + + + # %% HIDDEN LAYER. + # % FILL IN CODE. Replace the line below by one of the options. + # gradients$embed_to_hid = zeros(numhid1 * numwords, numhid2); + # % Options: + # % (a) embed_to_hid_weights_gradient = t(back_propagated_deriv_1) %*% embedding_layer_state; + #gradients$embed_to_hid = myTCrossProd(neural_net_states$embedding_layer_state, back_propagated_deriv_1); + embed_to_hid_weights_gradient <- neural_net_states$embedding_layer_state %*% t(back_propagated_deriv_1) + # % (c) embed_to_hid_weights_gradient = back_propagated_deriv_1; + # % (d) embed_to_hid_weights_gradient = embedding_layer_state; + + # % FILL IN CODE. Replace the line below by one of the options. + # gradients$hid_bias = zeros(numhid2, 1); + # % Options + #gradients$hid_bias = rowSums(back_propagated_deriv_1); + hid_bias_gradient <- rowSums(back_propagated_deriv_1) + # % (b) hid_bias_gradient = apply(back_propagated_deriv_1, 2, sum); + # % (c) hid_bias_gradient = back_propagated_deriv_1; + # % (d) hid_bias_gradient = back_propagated_deriv_1'; + + # % FILL IN CODE. Replace the line below by one of the options. + # back_propagated_deriv_2 = zeros(numhid2, batchsize); + # % Options + #back_propagated_deriv_2 = myMatMult(weights$embed_to_hid, back_propagated_deriv_1); + back_propagated_deriv_2 <- embed_to_hid_weights %*% back_propagated_deriv_1 + # % (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights; + # % (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights; + # % (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights'; #gradients$word_embedding[,] = 0; -# %% EMBEDDING LAYER. - - gradients$word_embedding[,] <- 0 + # %% EMBEDDING LAYER. 
+ + word_embedding_weights_gradient[,] <- 0 for(w in 1:numwords) { - tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] - gradients$word_embedding= gradients$word_embedding + myTCrossProd(expansion_matrix[, as.integer(input_batch[w, ])], tmp) + #tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] + #gradients$word_embedding= gradients$word_embedding + myTCrossProd(expansion_matrix[, as.integer(input_batch[w, ])], tmp) + word_embedding_weights_gradient <- word_embedding_weights_gradient + + expansion_matrix[,as.integer(input_batch[w,])] %*% + t(back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ]) } -# system.time({ -# embeddingFn <- function(w, back_propogated_deriv_2, numhid1, input_batch, expansion_matrix) { -# tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] -# myTCrossProd(expansion_matrix[, input_batch[w, ]], tmp) -# } -# w <- mclapply(1:numwords, embeddingFn, back_propogated_deriv_2=back_propogated_deriv_2, numhid1=numhid1, input_batch=input_batch, expansion_matrix=expansion_matrix) -# for(exMat in w) gradients$word_embedding <- gradients$word_embedding + exMat -# }) + + # % UPDATE WEIGHTS AND BIASES. + + word_embedding_weights_delta = momentum * word_embedding_weights_delta + + word_embedding_weights_gradient / batchsize; + word_embedding_weights = word_embedding_weights - (learning_rate * word_embedding_weights_delta); + + embed_to_hid_weights_delta = momentum * embed_to_hid_weights_delta + + embed_to_hid_weights_gradient / batchsize; + embed_to_hid_weights = embed_to_hid_weights - (learning_rate * embed_to_hid_weights_delta); -# % UPDATE WEIGHTS AND BIASES. + hid_to_output_weights_delta = momentum * hid_to_output_weights_delta + + hid_to_output_weights_gradient / batchsize; + hid_to_output_weights = hid_to_output_weights - (learning_rate * hid_to_output_weights_delta); - #deltas <- mapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize)) - deltas <- mcmapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize), mc.cores=5) + hid_bias_delta = momentum * hid_bias_delta + + hid_bias_gradient / batchsize; + hid_bias = hid_bias - (learning_rate * hid_bias_delta); - #weights <- mapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate)) - weights <- mcmapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate), mc.cores=5) + output_bias_delta = momentum * output_bias_delta + + output_bias_gradient / batchsize; + output_bias = output_bias - (learning_rate * output_bias_delta); -# % VALIDATE. + # % VALIDATE. 
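Each of the five parameter updates above applies the same momentum rule that the removed delta_update_fn/weights_update_fn pair expressed generically: the delta accumulates a decaying average of minibatch gradients, and the weights move against it. A minimal sketch on a made-up 2 x 3 parameter matrix (names and sizes are illustrative only):

    momentum_step <- function(weight, delta, gradient, momentum, learning_rate, batchsize) {
      delta  <- momentum * delta + gradient / batchsize  # decaying running average of the gradient
      weight <- weight - learning_rate * delta           # step against the smoothed gradient
      list(weight = weight, delta = delta)
    }

    w <- matrix(0, 2, 3); d <- matrix(0, 2, 3); g <- matrix(1, 2, 3)
    step <- momentum_step(w, d, g, momentum = 0.9, learning_rate = 0.1, batchsize = 100)
    step$weight  # every entry moves by -0.001 on this first step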
if(mod(m, show_validation_CE_after) == 0) { myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, weights, fn=repmat) -# benchmark( -# neural_net_states <- fprop(data$valid_input, weights, fn=repmat), -# neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), -# neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), -# neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), -# neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), -# replications=2 -# ) - + neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + hid_to_output_weights, hid_bias, output_bias) + # benchmark( + # neural_net_states <- fprop(data$valid_input, weights, fn=repmat), + # neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), + # neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), + # neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), + # neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), + # replications=2 + # ) - CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize + datasetsize <- size(data$valid_input, 2) + expanded_valid_target <- expansion_matrix[, as.integer(data$valid_target)] + CE = -matlab::sum(matlab::sum(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize myPrintf(' Validation CE %.3f\n', CE) } } @@ -203,31 +210,38 @@ train <- function(epochs) { myPrintf('Finished Training.\n') myPrintf('Final Training CE %.3f\n', trainset_CE) -# % EVALUATE ON VALIDATION SET. + # % EVALUATE ON VALIDATION SET. myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, weights, fn=repmat) - CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize + neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + hid_to_output_weights, hid_bias, output_bias) + datasetsize <- size(data$valid_input, 2) + expanded_valid_target <- expansion_matrix[, as.integer(data$valid_target)] + CE = -matlab::sum(matlab::sum(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize myPrintf('\rFinal Validation CE %.3f\n', CE) - -# % EVALUATE ON TEST SET. + + # % EVALUATE ON TEST SET. 
myPrintf('\rRunning test ...'); - neural_net_states <- fprop(data$test_input, weights, fn=repmat) - + neural_net_states <- fprop(data$test_input, word_embedding_weights, embed_to_hid_weights, + hid_to_output_weights, hid_bias, output_bias) datasetsize = size(data$valid_input, 2); expanded_valid_target = expansion_matrix[, as.integer(data$test_target)]; - CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize + CE = -matlab::sum(matlab::sum(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize myPrintf('\rFinal Test CE %.3f\n', CE) end_time <- proc.time() print(end_time - start_time) - return(list(weights=weights, vocab=data$vocab)) + return(list(word_embedding_weights = word_embedding_weights, + embed_to_hid_weights = embed_to_hid_weights, + hid_to_output_weights = hid_to_output_weights, + hid_bias = hid_bias, + output_bias = output_bias, + vocab=data$vocab)) } -delta_update_fn <- function(delta, gradient, momentum, batchsize) { momentum * delta + gradient / batchsize} -weights_update_fn <- function(delta, weight, learning_rate) {weight - learning_rate * delta} + # Rprof() # model <- train(1) diff --git a/Neural Net Language Model/Train_Original.R b/Neural Net Language Model/Train_Original.R index 74cbe31..4f1ad04 100644 --- a/Neural Net Language Model/Train_Original.R +++ b/Neural Net Language Model/Train_Original.R @@ -14,7 +14,7 @@ source("Neural Net Language Model/FProp_Original.R") # see also http://mathesaurus.sourceforge.net/octave-r.html # % This function trains a neural network language model. -train <- function(epochs) { +train_original <- function(epochs) { # % Inputs: # % epochs: Number of epochs to run. # % Output: @@ -36,7 +36,7 @@ train <- function(epochs) { show_validation_CE_after = 1000 # % LOAD DATA. - data <- load_data(batchsize) + data <- load_data_original(batchsize) tmp <- size(data$train_input) numwords <- tmp[1] batchsize <- tmp[2] @@ -73,14 +73,14 @@ train <- function(epochs) { # % FORWARD PROPAGATE. # % Compute the state of each layer in the network given the input batch and all weights and biases # returns the embedding, hidden and output layer states - neural_net_states <- fprop(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) + neural_net_states <- fprop_original(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) # benchmark( - # neural_net_states <- fprop(input_batch, weights, repmat), - # neural_net_states2 <- fprop(input_batch, weights, myRepMat), - # neural_net_states3 <- fprop(input_batch, weights, myRepMat2), - # neural_net_states3 <- fprop(input_batch, weights, myRepMat3), - # neural_net_states4 <- fprop(input_batch, weights, myRepMat4), + # neural_net_states <- fprop_original(input_batch, weights, repmat), + # neural_net_states2 <- fprop_original(input_batch, weights, myRepMat), + # neural_net_states3 <- fprop_original(input_batch, weights, myRepMat2), + # neural_net_states3 <- fprop_original(input_batch, weights, myRepMat3), + # neural_net_states4 <- fprop_original(input_batch, weights, myRepMat4), # replications=10 # ) @@ -180,14 +180,14 @@ train <- function(epochs) { # % VALIDATE. 
if(mod(m, show_validation_CE_after) == 0) { myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + neural_net_states <- fprop_original(data$valid_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) # benchmark( - # neural_net_states <- fprop(data$valid_input, weights, fn=repmat), - # neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), - # neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), - # neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), - # neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), + # neural_net_states <- fprop_original(data$valid_input, weights, fn=repmat), + # neural_net_states2 <- fprop_original(data$valid_input, weights, fn=myRepMat), + # neural_net_states3 <- fprop_original(data$valid_input, weights, myRepMat2), + # neural_net_states4 <- fprop_original(data$valid_input, weights, myRepMat3), + # neural_net_states5 <- fprop_original(data$valid_input, weights, myRepMat4), # replications=2 # ) @@ -205,7 +205,7 @@ train <- function(epochs) { # % EVALUATE ON VALIDATION SET. myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + neural_net_states <- fprop_original(data$valid_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) datasetsize <- size(data$valid_input, 2) expanded_valid_target <- expansion_matrix[, as.integer(data$valid_target)] @@ -215,7 +215,7 @@ train <- function(epochs) { # % EVALUATE ON TEST SET. myPrintf('\rRunning test ...'); - neural_net_states <- fprop(data$test_input, word_embedding_weights, embed_to_hid_weights, + neural_net_states <- fprop_original(data$test_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) datasetsize = size(data$valid_input, 2); expanded_valid_target = expansion_matrix[, as.integer(data$test_target)];
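Both evaluation blocks above rely on the same two steps: expand the integer word targets into 1-of-K columns by indexing an identity matrix, then average the cross-entropy of the predicted distributions over the cases. A small self-contained sketch with a made-up 5-word vocabulary and 3 cases (all values are illustrative only):

    vocab_size <- 5
    expansion_matrix <- diag(vocab_size)             # eye(vocab_size): column k is the 1-of-K code for word k
    targets <- c(2, 5, 3)                            # one target word index per case
    expanded_targets <- expansion_matrix[, targets]  # vocab_size x 3 matrix of one-hot columns

    probs <- matrix(1 / vocab_size, vocab_size, 3)   # dummy uniform predictions standing in for fprop output
    tiny  <- exp(-30)                                # keeps log() away from exactly-zero probabilities
    CE <- -sum(expanded_targets * log(probs + tiny)) / length(targets)
    CE                                               # log(5), about 1.609, for the uniform prediction

The division is by the number of cases (batchsize during training, datasetsize during validation and testing), so the reported CE is the average negative log-probability of the correct next word.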