From 1142c7d929aa52405702768c79cffc9b3977b227 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 30 Jan 2013 11:26:14 -0800 Subject: [PATCH] Inclusion of the prior R code for training. (labeled "Old") --- Neural Net Language Model/FProp_Old.R | 112 ++++++ Neural Net Language Model/LoadData_Old.R | 62 ++++ Neural Net Language Model/MatlabFunctions.R | 12 + .../MatlabFunctions_Old.R | 332 ++++++++++++++++++ Neural Net Language Model/Train_Old.R | 235 +++++++++++++ 5 files changed, 753 insertions(+) create mode 100644 Neural Net Language Model/FProp_Old.R create mode 100644 Neural Net Language Model/LoadData_Old.R create mode 100644 Neural Net Language Model/MatlabFunctions.R create mode 100644 Neural Net Language Model/MatlabFunctions_Old.R create mode 100644 Neural Net Language Model/Train_Old.R diff --git a/Neural Net Language Model/FProp_Old.R b/Neural Net Language Model/FProp_Old.R new file mode 100644 index 0000000..31d145e --- /dev/null +++ b/Neural Net Language Model/FProp_Old.R @@ -0,0 +1,112 @@ +# library(Matrix) +#library(matrixStats) + +fprop <- function(input_batch, weights, fn) { +# % This method forward propagates through a neural network. +# % Inputs: +# % input_batch: The input data as a matrix of size numwords X batchsize where, +# % numwords is the number of words, batchsize is the number of data points. +# % So, if input_batch(i, j) = k then the ith word in data point j is word +# % index k of the vocabulary. +# % +# % word_embedding_weights: Word embedding as a matrix of size +# % vocab_size X numhid1, where vocab_size is the size of the vocabulary +# % numhid1 is the dimensionality of the embedding space. +# % +# % embed_to_hid_weights: Weights between the word embedding layer and hidden +# % layer as a matrix of size numhid1*numwords X numhid2, numhid2 is the +# % number of hidden units. +# % +# % hid_to_output_weights: Weights between the hidden layer and output softmax +# % unit as a matrix of size numhid2 X vocab_size +# % +# % hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1. +# % +# % output_bias: Bias of the output layer as a matrix of size vocab_size X 1. +# % +# % Outputs: +# % embedding_layer_state: State of units in the embedding layer as a matrix of +# % size numhid1*numwords X batchsize +# % +# % hidden_layer_state: State of units in the hidden layer as a matrix of size +# % numhid2 X batchsize +# % +# % output_layer_state: State of units in the output layer as a matrix of size +# % vocab_size X batchsize +# % + + tmp <- dim(input_batch) # basically dim + numwords <- tmp[1] + batchsize <- tmp[2] + + tmp <- dim(weights$word_embedding) # basically dim + vocab_size <- tmp[1] + numhid1 <- tmp[2] + + numhid2 <- ncol(weights$embed_to_hid) + +# %% COMPUTE STATE OF WORD EMBEDDING LAYER. +# % Look up the inputs word indices in the word_embedding_weights matrix. + # each row of the word weights corresponds to a word (250 total) + # input_batch contains 300 total words (3 * 100 batchsize) + # each element of input_batch is a number between 1 and 249 (250?), corresponding to a word + #embedding_layer_state2 <- matrix(as.numeric(weights$word_embedding[as.integer(input_batch), ]), nrow=numhid1 * numwords) + embedding_layer_state <- myReshape(weights$word_embedding[as.integer(input_batch),], nrows=numhid1 * numwords) + + #embedding_layer_state2 <- matlab::reshape(as.matrix(weights$word_embedding[as.numeric(input_batch), ]), numhid1*numwords, 100) + +# %% COMPUTE STATE OF HIDDEN LAYER. +# % Compute inputs to hidden units. 
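# Illustrative sketch (not from the original code; toy sizes assumed): the two
# blocks that follow compute sigmoid(t(embed_to_hid) %*% embedding_state + hid_bias)
# and then a numerically stable softmax over the output layer. In base R the
# same steps can be written with sweep() doing the broadcasting that the
# repmat-style helper `fn` performs:
toy_embed <- matrix(rnorm(6 * 4), 6, 4)    # (numhid1*numwords) x batchsize
toy_W_eh  <- matrix(rnorm(6 * 3), 6, 3)    # embed_to_hid:  6 embedding dims -> 3 hidden units
toy_W_ho  <- matrix(rnorm(3 * 5), 3, 5)    # hid_to_output: 3 hidden units  -> 5 vocabulary words
toy_b_h   <- rnorm(3)
toy_b_o   <- rnorm(5)
toy_hid    <- 1 / (1 + exp(-sweep(crossprod(toy_W_eh, toy_embed), 1, toy_b_h, "+")))
toy_logits <- sweep(crossprod(toy_W_ho, toy_hid), 1, toy_b_o, "+")
toy_logits <- sweep(toy_logits, 2, apply(toy_logits, 2, max), "-")  # subtract column maxima
toy_exp    <- exp(toy_logits)
toy_probs  <- sweep(toy_exp, 2, colSums(toy_exp), "/")
stopifnot(all(abs(colSums(toy_probs) - 1) < 1e-12))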
+ # crossprod = t(x) %*% y + inputs_to_hidden_units = myCrossProd(weights$embed_to_hid, embedding_layer_state) + fn(weights$hid_bias, 1, batchsize) + +# benchmark( +# tmp1 <- repmat(weights$hid_bias, 1, batchsize), +# tmp2 <- myRepMat4(weights$hid_bias, 1, batchsize), +# replications=10 +# ) + + +# % Apply logistic activation function. +# % FILL IN CODE. Replace the line below by one of the options. +# hidden_layer_state = zeros(numhid2, batchsize) +# % Options +# % (a) hidden_layer_state = 1 ./ (1 + exp(inputs_to_hidden_units)); +# % (b) hidden_layer_state = 1 ./ (1 - exp(-inputs_to_hidden_units)); + hidden_layer_state = 1 / (1 + exp(-inputs_to_hidden_units)) +# % (d) hidden_layer_state = -1 ./ (1 + exp(-inputs_to_hidden_units)); + +# %% COMPUTE STATE OF OUTPUT LAYER. +# % Compute inputs to softmax. +# % FILL IN CODE. Replace the line below by one of the options. +# inputs_to_softmax = zeros(vocab_size, batchsize) +# % Options + inputs_to_softmax = myCrossProd(weights$hid_to_output, hidden_layer_state) + fn(weights$output_bias, 1, batchsize) + +# % (b) inputs_to_softmax = t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, batchsize, 1); +# % (c) inputs_to_softmax = hidden_layer_state %*% t(hid_to_output_weights) + repmat(output_bias, 1, batchsize); +# % (d) inputs_to_softmax = hid_to_output_weights %*% hidden_layer_state + repmat(output_bias, batchsize, 1); + +# % Subtract maximum. +# % Remember that adding or subtracting the same constant from each input to a +# % softmax unit does not affect the outputs. Here we are subtracting maximum to +# % make all inputs <= 0. This prevents overflows when computing their +# % exponents. + # max in matlab returns max from each column by default +# benchmark( +# tmp <- apply(inputs_to_softmax, 2, max), +# tmp2 <- colMaxs(inputs_to_softmax), +# replications=10) + tmp <- apply(inputs_to_softmax, 2, max) + inputs_to_softmax = inputs_to_softmax - fn(tmp, vocab_size, 1) + +# % Compute exp. + output_layer_state = exp(inputs_to_softmax) + +# % Normalize to get probability distribution. + output_layer_state = output_layer_state / fn(colSums(output_layer_state), vocab_size, 1) + + return(list(embedding_layer_state=embedding_layer_state, + hidden_layer_state=hidden_layer_state, + output_layer_state=output_layer_state)) +} \ No newline at end of file diff --git a/Neural Net Language Model/LoadData_Old.R b/Neural Net Language Model/LoadData_Old.R new file mode 100644 index 0000000..716e7c4 --- /dev/null +++ b/Neural Net Language Model/LoadData_Old.R @@ -0,0 +1,62 @@ +library(R.matlab) +# library(Matrix) +library(parallel); options(mc.cores = 4) + +load_data <- function(N) { + # % This method loads the training, validation and test set. + # % It also divides the training set into mini-batches. + # % Inputs: + # % N: Mini-batch size. + # % Outputs: + # % train_input: An array of size D X N X M, where + # % D: number of input dimensions (in this case, 3). + # % N: size of each mini-batch (in this case, 100). + # % M: number of minibatches. + # % train_target: An array of size 1 X N X M. + # % valid_input: An array of size D X number of points in the validation set. + # % test: An array of size D X number of points in the test set. + # % vocab: Vocabulary containing index to word mapping. 
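# Illustrative sketch (not from the original code): the minibatch split built
# below turns the D x (N*M) training matrix into a list of M matrices of size
# D x N, discarding any leftover columns. With toy sizes D = 3, N = 4, M = 2:
toy_train <- matrix(seq_len(3 * 9), nrow = 3)   # 3 x 9; the ninth column is dropped
toy_N     <- 4
toy_M     <- floor(ncol(toy_train) / toy_N)      # 2 full minibatches
toy_batches <- lapply(seq_len(toy_M),
                      function(b) toy_train[, ((b - 1) * toy_N + 1):(b * toy_N)])
stopifnot(length(toy_batches) == toy_M, all(dim(toy_batches[[1]]) == c(3, toy_N)))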
+ + data.mat <- readMat("Neural Net Language Model/data.mat") + data <- list(testData = (data.mat$data[1,1,1][[1]]), + trainData = (data.mat$data[2,1,1][[1]]), + validData = (data.mat$data[3,1,1][[1]]), + vocab = unlist(data.mat$data[4,1,1]) + ) + + numdims = nrow(data$trainData) + D = numdims - 1 # subtract 1 because 1:D is the number of input words and D is the predicted word + M = floor(ncol(data$trainData) / N) + + # shift to an list of M minibatches, each with D*N + # looks like we threw out the remainder training data + splitMatrixIntoBatch <- function(dat, b, N, byCol=TRUE) { + # N is the size of each batch + # b is the requested batch + if(length(dim(dat)) == 0) { + if(byCol) dim(dat) <- c(1, length(dat)) else dim(dat) <- c(length(dat), 1) + } + start <- ((b - 1) * N) + 1 + end <- b * N + + if(byCol) return(dat[,start:end]) else return(dat[start:end,]) + } + train_input <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[1:D,], byCol=TRUE) + train_target <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[D+1,], byCol=TRUE) + + valid_input <- (data$validData[1:D,]) + valid_target <- data$validData[D + 1,] + + test_input <- (data$testData[1:D,]) + test_target <- data$testData[D + 1,] + + vocab <- data$vocab + + return(list(train_input=train_input, + train_target=train_target, + valid_input=valid_input, + valid_target=valid_target, + test_input=test_input, + test_target=test_target, + vocab=vocab)) +} \ No newline at end of file diff --git a/Neural Net Language Model/MatlabFunctions.R b/Neural Net Language Model/MatlabFunctions.R new file mode 100644 index 0000000..11f3e0c --- /dev/null +++ b/Neural Net Language Model/MatlabFunctions.R @@ -0,0 +1,12 @@ +# Necessary matlab functions + +# Matlab helper functions +# Functions to replicate various matlab commands + +# most are found in the matlab library +library(matlab) + +# HELPER FUNCTIONS +randn <- function(x, y) matrix(data=rnorm(x*y), nrow=x, ncol=y) +myPrintf <- function(txt, ...) writeLines(sprintf(txt, ...), sep="", con=stdout(), useBytes=TRUE) + diff --git a/Neural Net Language Model/MatlabFunctions_Old.R b/Neural Net Language Model/MatlabFunctions_Old.R new file mode 100644 index 0000000..a92f7ab --- /dev/null +++ b/Neural Net Language Model/MatlabFunctions_Old.R @@ -0,0 +1,332 @@ +# Matlab helper functions +# Functions to replicate various matlab commands + +# most are found in the matlab library +library(matlab) +library(Matrix) + +# HELPER FUNCTIONS +randn <- function(x, y) matrix(data=rnorm(x*y), nrow=x, ncol=y) +myPrintf <- function(txt, ...) 
writeLines(sprintf(txt, ...), sep="", con=stdout(), useBytes=TRUE) +myReshape <- function(mat, nrows, ncols) { + if(missing(ncols)) { + dim(mat) <- c(nrows, floor(length(mat) / nrows)) + } else { + dim(mat) <- c(floor(length(mat) / ncols), ncols) + } + return(mat) +} + +myRepMat <- function(A, nrows, ncols) { + # only works for numeric matrices with set number of rows & columns + stopifnot(nrows > 0 & ncols > 0) + if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- (t(myMatMult(A, create1Matrix(nrows)))) + } else if(nrows == 1) { + B <- (myMatMult(A, create1Matrix(ncols))) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + B <- (myMatMult(A, matrix(1, nrow=nrows, ncol=ncols))) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) + +} + +# tmp <- A +# benchmark( +# dim(tmp) <- c(length(A), 1), +# tmp <- Matrix(A, ncol=1, nrow=length(A)), +# replications=100 +# ) + +myRepMat2 <- function(A, nrows, ncols) { + # only works for numeric matrices with set number of rows & columns + stopifnot(nrows > 0 & ncols > 0) + if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- (t(kronecker(create1Matrix(nrows), A))) + } else if(nrows == 1) { + B <- (kronecker(create1Matrix(ncols), A)) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + B <- (kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) +} + +myRepMat3 <- function(A, nrows, ncols) { + # only works for numeric matrices with set number of rows & columns + stopifnot(nrows > 0 & ncols > 0) + if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- (t((A %*% create1Matrix(nrows)))) + } else if(nrows == 1) { + B <- ((A %*% create1Matrix(ncols))) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + B <- ((A %*% matrix(1, nrow=nrows, ncol=ncols))) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) + +} + +myRepMat4 <- function(A, nrows, ncols) { + stopifnot(nrows > 0 & ncols > 0) + + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- Matrix(A, nrow=nrow, ncol=length(A), byrow=FALSE) + + # B <- (t((A %*% create1Matrix(nrows)))) + } else if(nrows == 1) { + B <- Matrix(A, nrow=length(A), ncol=ncols, byrow=TRUE) + # B <- ((A %*% create1Matrix(ncols))) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + #if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + B <- ((A %*% matrix(1, nrow=nrows, ncol=ncols))) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) + + + +} + +create1Matrix <- function(times) { + tmp <- rep.int(1, times=times) + dim(tmp) <- c(1, times) + return(tmp) +} + +benchmark( + tmp <- create1Matrix(1000), + tmp2 <- Matrix(1, 1, 1000), + replications=100 + + ) + + +# benchmark( +# b1 <- matrix(1, nrow=1, ncol=46568), +# b2 <- Matrix(1, nrow=1, ncol=46568), +# b3 <- fn(46568), +# replications=1000) + +# +# benchmark( +# tmp1 <- repmat(A, nrows, ncols), +# tmp2 <- myRepMat(A, nrows, ncols), +# replications = 100 +# +# ) + + +# if("gputools" %in% 
.packages()) { +# myCrossProd <- gpuCrossprod +# myTCrossProd <- gpuTcrossprod +# myMatMult <- gpuMatMult +# } else { +# myCrossProd <- crossprod +# myTCrossProd <- tcrossprod +# myMatMult <- function(x, y) x %*% y +# } + +USE_GPU <- "gputools" %in% .packages() +SPARSE_MATRIX_CLASSES <- c("dsCMatrix", "ddiMatrix", "dtCMatrix", "dtTMatrix", "dgTMatrix", "dtRMatrix", "dsTMatrix", "dgRMatrix", "dgCMatrix") + +myMatMult <- function(a, b=NULL) { + # if one is a vector, then no gpu + # if sparse, then no gpu + if(is.null(b)) b <- a + + if(USE_GPU) { + dim.a <- dim(a) + dim.b <- dim(b) + is.vector <- is.null(dim.a) | is.null(dim.b) | 1 %in% dim.a | 1 %in% dim.b + if(!is.vector) { + classes <- c(class(a), class(b)) + is.sparse.matrix <- all(classes %in% SPARSE_MATRIX_CLASSES) # sparseVector as well, but not needed here + + if(!is.sparse.matrix) return(gpuMatMult(a, b)) + } + } + + a %*% b +} + +myCrossProd <- function(a, b=NULL) { + # if one is a vector, then no gpu + # if sparse, then no gpu, unless: + # if large sparse matrix then gpu; otherwise none + if(is.null(b)) b <- a + if(USE_GPU) { + dim.a <- dim(a) + dim.b <- dim(b) + is.vector <- is.null(dim.a) | is.null(dim.b) | 1 %in% dim.a | 1 %in% dim.b + if(!is.vector) { + large.matrix <- any(dims > 1000) + + classes <- c(class(a), class(b)) + is.sparse.matrix <- all(classes %in% SPARSE_MATRIX_CLASSES) # sparseVector as well, but not needed here + + if(!is.sparse.matrix | large.matrix) return(gpuCrossprod(a, b)) + } + } + + crossprod(a, b) +} + +myTCrossProd <- function(a, b=NULL) { + # if one is a vector, then no gpu + # if sparse, then no gpu + if(is.null(b)) b <- a + if(USE_GPU) { + dim.a <- dim(a) + dim.b <- dim(b) + is.vector <- is.null(dim.a) | is.null(dim.b) | 1 %in% dim.a | 1 %in% dim.b + if(is.vector) { + classes <- c(class(a), class(b)) + is.sparse.matrix <- all(classes %in% SPARSE_MATRIX_CLASSES) # sparseVector as well, but not needed here + + if(!is.sparse.matrix) return(gpuTcrossprod(a, b)) + } + } + + tcrossprod(a, b) +} + + +# mm <- matrix(runif(3000*2000), 3000, 2000) +# mmsparse<- matrix(c(1,0,0,0,0,0,0,0), nr=3000, nc=2000) +# +# MM <- Matrix(mm) +# MMsparse<- Matrix(mmsparse) +# +# +# benchmark( +# tmp1 <- myMatMult(mmsparse, t(mmsparse)), +# tmp2 <- myMatMult(mm, t(mm)), +# tmp3 <- myMatMult(MM, t(MM)), +# tmp4 <- myMatMult(MMsparse, t(MMsparse)), +# tmp5 <- MM %*% t(MM), +# tmp6 <- gpuMatMult(MMsparse, t(MMsparse)), +# replications=10 +# ) +# +# benchmark( +# tmp1 <- myCrossProd(mmsparse), +# tmp2 <- myCrossProd(mm), +# tmp3 <- myCrossProd(MM), +# tmp4 <- myCrossProd(MMsparse), +# replications=10 +# ) +# +# benchmark( +# tmp1 <- myTCrossProd(mmsparse), +# tmp2 <- myTCrossProd(mm), +# tmp3 <- myTCrossProd(MM), +# tmp4 <- myTCrossProd(MMsparse), +# replications=10 +# ) + + +zeros <- function(...) { + # modified from matlab to incorporate sparse matrix when possible + + nargs <- length(dots <- list(...)) + dims <- as.integer(if (nargs == 1 && matlab:::is.size_t(dots[[1]])) { + dots[[1]] + } else { + unlist(dots) + }) + if (length(dims) == 1) { + dims[2] <- dims[1] + } + if (!(length(dims) > 1)) { + stop("dimensions must be of length greater than 1") + } + else if (!(all(dims > 0))) { + stop("dimensions must be a positive quantity") + } + if(length(dims) == 2) return(Matrix(0, nrow=dims[1], ncol=dims[2], sparse=TRUE)) + #if(length(dims) == 2) return(matrix(0, nrow=dims[1], ncol=dims[2])) + + return(array(0, dims)) +} + +repmat <- function(A, ...) 
+{ + nargs <- length(dots <- list(...)) + dims <- as.integer(if (nargs == 1 && matlab:::is.size_t(dots[[1]])) { + dots[[1]] + } else { + unlist(dots) + }) + if (length(dims) == 1) { + dims[2] <- dims[1] + } + if (!(length(dims) > 1)) { + stop("dimensions must be of length greater than 1") + } else if (!(all(dims > 0))) { + stop("dimensions must be a positive quantity") + } + + if(is.null(dim(A)) & class(A) == "dsparseVector") A <- Matrix(A, ncol=1, nrow = length(A)) + + B <- switch(EXPR = mode(A), logical = , complex = , numeric = , S4 = { + if (all(dims == 1)) { + A + } else if (dims[length(dims)] == 1) { + t(kronecker(array(1, rev(dims)), A)) + } else { + kronecker(array(1, dims), A) + } + }, character = { + fA <- factor(A, levels = unique(A)) + iA.mat <- Recall(unclass(fA), dims) + saved.dim <- dim(iA.mat) + cA.mat <- sapply(seq(along = iA.mat), function(i, A, + fac) { + A[i] <- levels(fac)[A[i]] + }, iA.mat, fA) + dim(cA.mat) <- saved.dim + cA.mat + }, NULL) + if (is.null(B)) { + stop(sprintf("argument %s must be one of [%s]", sQuote("A"), + paste(c("numeric", "logical", "complex", "character"), + collapse = "|"))) + } + return(B) +} + +setMethod(size, signature(X="Matrix", dimen="ANY"), function(X, dimen) if(missing(dimen)) size(as.matrix(X)) else size(as.matrix(X), dimen)) + + + + \ No newline at end of file diff --git a/Neural Net Language Model/Train_Old.R b/Neural Net Language Model/Train_Old.R new file mode 100644 index 0000000..1fb2d29 --- /dev/null +++ b/Neural Net Language Model/Train_Old.R @@ -0,0 +1,235 @@ +# setwd("~/R_Projects/Neural Networks for Machine Learning") +library(gputools) +library(Matrix) +library(parallel); options(mc.cores=4) +library(R.utils) + +source("MatlabFunctions.R") +source("Neural Net Language Model/LoadData.R") +source("Neural Net Language Model/FProp.R") + + +# see also http://mathesaurus.sourceforge.net/octave-r.html + +# % This function trains a neural network language model. +train <- function(epochs) { +# % Inputs: +# % epochs: Number of epochs to run. +# % Output: +# % model: A struct containing the learned weights and biases and vocabulary. + + start_time <- proc.time() + +# % SET HYPERPARAMETERS HERE. + batchsize = 1000 # Mini-batch size; default = 100. + learning_rate = 0.1 # Learning rate; default = 0.1. + momentum = 0.9 # Momentum; default = 0.9. + numhid1 = 50 # Dimensionality of embedding space; default = 50. + numhid2 = 200 # Number of units in hidden layer; default = 200. + init_wt = 0.01 # Standard deviation of the normal distribution, which is sampled to get the initial weights; default = 0.01 + +# % VARIABLES FOR TRACKING TRAINING PROGRESS. + # number is the number of batches to run before showing training & validation + show_training_CE_after = 100 + show_validation_CE_after = 100 + +# % LOAD DATA. + data <- load_data(batchsize) + tmp <- data$train_input[[1]] + numwords <- nrow(tmp) + batchsize <- ncol(tmp) + numbatches <- length(data$train_input) + vocab_size <- length(data$vocab) + +# % INITIALIZE WEIGHTS AND BIASES. 
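# Illustrative sketch (not from the original code): a spelled-out equivalent of
# the mapply-based initialisation below, under the same conventions -- weight
# matrices drawn from N(0, init_wt^2), biases started at zero (dense here,
# sparse vectors in the code that follows). The shapes match the d1/d2 lists below.
init_weights_plain <- function(vocab_size, numwords, numhid1, numhid2, init_wt) {
  list(word_embedding = init_wt * matrix(rnorm(vocab_size * numhid1), vocab_size, numhid1),
       embed_to_hid   = init_wt * matrix(rnorm(numwords * numhid1 * numhid2), numwords * numhid1, numhid2),
       hid_to_output  = init_wt * matrix(rnorm(numhid2 * vocab_size), numhid2, vocab_size),
       hid_bias       = numeric(numhid2),
       output_bias    = numeric(vocab_size))
}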
+ d1 <- list(word_embedding=vocab_size, + embed_to_hid=numwords * numhid1, + hid_to_output=numhid2, + hid_bias=numhid2, + output_bias=vocab_size) + d2 <- list(word_embedding=numhid1, + embed_to_hid=numhid2, + hid_to_output=vocab_size, + hid_bias=1, + output_bias=1) + + + weights <- c(mapply(function(x, y, init_wt) init_wt*randn(x,y), x=d1[1:3], y=d2[1:3], MoreArgs=list(init_wt=init_wt)), + mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0), i=0)) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) + + deltas <- c(mapply(zeros, d1[1:3], d2[1:3]), + mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0, i=0))) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) + + gradients <- list(word_embedding=zeros(d1[1], d2[1]), + embed_to_hid=NULL, + hid_to_output=NULL, + hid_bias=NULL, + output_bias=NULL) + + expansion_matrix = .symDiagonal(vocab_size) # expanding the matrix is faster on a Diagonal class; .symDiagonal faster than just Diagonal here (b/c we are indexing it) + #expansion_matrix = eye(vocab_size) + count = 0 + tiny = exp(-30) + + datasetsize = ncol(data$valid_input) + #datasetsize = + expanded_valid_target = expansion_matrix[, as.integer(data$valid_target)] + + + + +# % TRAIN. + for(epoch in 1:epochs) { + myPrintf('Epoch %d\n', epoch) + this_chunk_CE <- 0 + trainset_CE <- 0 + +# % LOOP OVER MINI-BATCHES. + for(m in 1:numbatches) { + + input_batch <- data$train_input[[m]] + target_batch <- data$train_target[[m]] +# % FORWARD PROPAGATE. +# % Compute the state of each layer in the network given the input batch and all weights and biases + # returns the embedding, hidden and output layer states + neural_net_states <- fprop(input_batch, weights, repmat) + +# benchmark( +# neural_net_states <- fprop(input_batch, weights, repmat), +# neural_net_states2 <- fprop(input_batch, weights, myRepMat), +# neural_net_states3 <- fprop(input_batch, weights, myRepMat2), +# neural_net_states3 <- fprop(input_batch, weights, myRepMat3), +# neural_net_states4 <- fprop(input_batch, weights, myRepMat4), +# replications=10 +# ) + +# % COMPUTE DERIVATIVE. +# %% Expand the target to a sparse 1-of-K vector. + expanded_target_batch = expansion_matrix[, as.integer(target_batch)] +# %% Compute derivative of cross-entropy loss function. + error_deriv = neural_net_states$output_layer_state - expanded_target_batch + +# % MEASURE LOSS FUNCTION. + CE = -sum(colSums(expanded_target_batch * log(neural_net_states$output_layer_state + tiny))) / batchsize + count = count + 1 + this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count + trainset_CE = trainset_CE + (CE - trainset_CE) / m + #printf('\rBatch %d Train CE %.3f', m, this_chunk_CE) + myPrintf('\rBatch %d Train CE %.3f', m, this_chunk_CE) + if (mod(m, show_training_CE_after) == 0) { + myPrintf('\n') + count = 0 + this_chunk_CE = 0 + } + +# % BACK PROPAGATE. +# %% OUTPUT LAYER. + # tcrossprod == x %*% t(y) + gradients$hid_to_output = myTCrossProd(neural_net_states$hidden_layer_state, error_deriv) + gradients$output_bias = rowSums(error_deriv) + back_propagated_deriv_1 = myMatMult(weights$hid_to_output,error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) + +# %% HIDDEN LAYER. +# % FILL IN CODE. Replace the line below by one of the options. 
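# Illustrative sketch (not from the original code): the factor
# hidden_layer_state * (1 - hidden_layer_state) used just above is the
# derivative of the logistic activation, since s(x) = 1 / (1 + exp(-x)) has
# s'(x) = s(x) * (1 - s(x)). A quick finite-difference check:
sigmoid <- function(x) 1 / (1 + exp(-x))
x0  <- 0.3
eps <- 1e-6
numeric_deriv  <- (sigmoid(x0 + eps) - sigmoid(x0 - eps)) / (2 * eps)
analytic_deriv <- sigmoid(x0) * (1 - sigmoid(x0))
stopifnot(abs(numeric_deriv - analytic_deriv) < 1e-8)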
+# gradients$embed_to_hid = zeros(numhid1 * numwords, numhid2); +# % Options: +# % (a) embed_to_hid_weights_gradient = t(back_propagated_deriv_1) %*% embedding_layer_state; + gradients$embed_to_hid = myTCrossProd(neural_net_states$embedding_layer_state, back_propagated_deriv_1); +# % (c) embed_to_hid_weights_gradient = back_propagated_deriv_1; +# % (d) embed_to_hid_weights_gradient = embedding_layer_state; + +# % FILL IN CODE. Replace the line below by one of the options. +# gradients$hid_bias = zeros(numhid2, 1); +# % Options + gradients$hid_bias = rowSums(back_propagated_deriv_1); +# % (b) hid_bias_gradient = apply(back_propagated_deriv_1, 2, sum); +# % (c) hid_bias_gradient = back_propagated_deriv_1; +# % (d) hid_bias_gradient = back_propagated_deriv_1'; + +# % FILL IN CODE. Replace the line below by one of the options. +# back_propagated_deriv_2 = zeros(numhid2, batchsize); +# % Options + back_propagated_deriv_2 = myMatMult(weights$embed_to_hid, back_propagated_deriv_1); +# % (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights; +# % (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights; +# % (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights'; + + #gradients$word_embedding[,] = 0; +# %% EMBEDDING LAYER. + + gradients$word_embedding[,] <- 0 + for(w in 1:numwords) { + tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] + gradients$word_embedding= gradients$word_embedding + myTCrossProd(expansion_matrix[, as.integer(input_batch[w, ])], tmp) + } + +# system.time({ +# embeddingFn <- function(w, back_propogated_deriv_2, numhid1, input_batch, expansion_matrix) { +# tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] +# myTCrossProd(expansion_matrix[, input_batch[w, ]], tmp) +# } +# w <- mclapply(1:numwords, embeddingFn, back_propogated_deriv_2=back_propogated_deriv_2, numhid1=numhid1, input_batch=input_batch, expansion_matrix=expansion_matrix) +# for(exMat in w) gradients$word_embedding <- gradients$word_embedding + exMat +# }) + +# % UPDATE WEIGHTS AND BIASES. + + #deltas <- mapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize)) + deltas <- mcmapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize), mc.cores=5) + + #weights <- mapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate)) + weights <- mcmapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate), mc.cores=5) + +# % VALIDATE. + if(mod(m, show_validation_CE_after) == 0) { + myPrintf('\rRunning validation ...') + neural_net_states <- fprop(data$valid_input, weights, fn=repmat) +# benchmark( +# neural_net_states <- fprop(data$valid_input, weights, fn=repmat), +# neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), +# neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), +# neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), +# neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), +# replications=2 +# ) + + + CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize + myPrintf(' Validation CE %.3f\n', CE) + } + } + myPrintf('\rAverage Training CE %.3f\n', trainset_CE) + } + myPrintf('Finished Training.\n') + myPrintf('Final Training CE %.3f\n', trainset_CE) + +# % EVALUATE ON VALIDATION SET. 
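# Illustrative sketch (not from the original code): the evaluation below (and
# the per-batch CE above) is the mean negative log-probability assigned to the
# target words. With a 1-of-K expanded target the elementwise product keeps
# exactly one log-probability per column, so the two forms agree:
toy_probs    <- matrix(c(0.7, 0.2, 0.1,  0.1, 0.8, 0.1), nrow = 3)  # vocab 3 x batch 2
toy_targets  <- c(1, 2)                                             # target word per case
toy_expanded <- diag(3)[, toy_targets]
toy_tiny     <- exp(-30)
ce_expanded  <- -sum(colSums(toy_expanded * log(toy_probs + toy_tiny))) / length(toy_targets)
ce_indexed   <- -mean(log(toy_probs[cbind(toy_targets, seq_along(toy_targets))] + toy_tiny))
stopifnot(abs(ce_expanded - ce_indexed) < 1e-12)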
+  myPrintf('\rRunning validation ...')
+  
+  neural_net_states <- fprop(data$valid_input, weights, fn=repmat)
+  CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize
+  myPrintf('\rFinal Validation CE %.3f\n', CE)
+  
+# % EVALUATE ON TEST SET.
+  myPrintf('\rRunning test ...');
+  
+  neural_net_states <- fprop(data$test_input, weights, fn=repmat)
+  
+  datasetsize = size(data$test_input, 2);  # normalise by the test set size, not the validation set
+  expanded_test_target = expansion_matrix[, as.integer(data$test_target)];
+  CE = -sum(colSums(expanded_test_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize
+  myPrintf('\rFinal Test CE %.3f\n', CE)
+  
+  end_time <- proc.time()
+  
+  print(end_time - start_time)
+  return(list(weights=weights, vocab=data$vocab))
+}
+
+delta_update_fn <- function(delta, gradient, momentum, batchsize) { momentum * delta + gradient / batchsize }
+weights_update_fn <- function(delta, weight, learning_rate) { weight - learning_rate * delta }
+
+# Rprof()
+# model <- train(1)
+# Rprof(NULL)
+# summaryRprof()
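# Illustrative sketch (not from the original code): one momentum update of the
# kind applied by delta_update_fn / weights_update_fn above. The delta keeps a
# momentum-weighted running average of the batch-averaged gradient, and the
# weight then moves against it by the learning rate:
toy_weight   <- matrix(0.5, 2, 2)
toy_delta    <- matrix(0,   2, 2)
toy_gradient <- matrix(1,   2, 2)   # gradient summed over a toy minibatch
toy_momentum <- 0.9; toy_lr <- 0.1; toy_batchsize <- 100
toy_delta  <- toy_momentum * toy_delta + toy_gradient / toy_batchsize
toy_weight <- toy_weight - toy_lr * toy_delta
stopifnot(all(abs(toy_weight - 0.499) < 1e-12))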