From 1142c7d929aa52405702768c79cffc9b3977b227 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 30 Jan 2013 11:26:14 -0800 Subject: [PATCH] Inclusion of the prior R code for training. (labeled "Old") --- Neural Net Language Model/FProp_Old.R | 112 ++++++ Neural Net Language Model/LoadData_Old.R | 62 ++++ Neural Net Language Model/MatlabFunctions.R | 12 + .../MatlabFunctions_Old.R | 332 ++++++++++++++++++ Neural Net Language Model/Train_Old.R | 235 +++++++++++++ 5 files changed, 753 insertions(+) create mode 100644 Neural Net Language Model/FProp_Old.R create mode 100644 Neural Net Language Model/LoadData_Old.R create mode 100644 Neural Net Language Model/MatlabFunctions.R create mode 100644 Neural Net Language Model/MatlabFunctions_Old.R create mode 100644 Neural Net Language Model/Train_Old.R diff --git a/Neural Net Language Model/FProp_Old.R b/Neural Net Language Model/FProp_Old.R new file mode 100644 index 0000000..31d145e --- /dev/null +++ b/Neural Net Language Model/FProp_Old.R @@ -0,0 +1,112 @@ +# library(Matrix) +#library(matrixStats) + +fprop <- function(input_batch, weights, fn) { +# % This method forward propagates through a neural network. +# % Inputs: +# % input_batch: The input data as a matrix of size numwords X batchsize where, +# % numwords is the number of words, batchsize is the number of data points. +# % So, if input_batch(i, j) = k then the ith word in data point j is word +# % index k of the vocabulary. +# % +# % word_embedding_weights: Word embedding as a matrix of size +# % vocab_size X numhid1, where vocab_size is the size of the vocabulary +# % numhid1 is the dimensionality of the embedding space. +# % +# % embed_to_hid_weights: Weights between the word embedding layer and hidden +# % layer as a matrix of size numhid1*numwords X numhid2, numhid2 is the +# % number of hidden units. +# % +# % hid_to_output_weights: Weights between the hidden layer and output softmax +# % unit as a matrix of size numhid2 X vocab_size +# % +# % hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1. +# % +# % output_bias: Bias of the output layer as a matrix of size vocab_size X 1. +# % +# % Outputs: +# % embedding_layer_state: State of units in the embedding layer as a matrix of +# % size numhid1*numwords X batchsize +# % +# % hidden_layer_state: State of units in the hidden layer as a matrix of size +# % numhid2 X batchsize +# % +# % output_layer_state: State of units in the output layer as a matrix of size +# % vocab_size X batchsize +# % + + tmp <- dim(input_batch) # basically dim + numwords <- tmp[1] + batchsize <- tmp[2] + + tmp <- dim(weights$word_embedding) # basically dim + vocab_size <- tmp[1] + numhid1 <- tmp[2] + + numhid2 <- ncol(weights$embed_to_hid) + +# %% COMPUTE STATE OF WORD EMBEDDING LAYER. +# % Look up the inputs word indices in the word_embedding_weights matrix. + # each row of the word weights corresponds to a word (250 total) + # input_batch contains 300 total words (3 * 100 batchsize) + # each element of input_batch is a number between 1 and 249 (250?), corresponding to a word + #embedding_layer_state2 <- matrix(as.numeric(weights$word_embedding[as.integer(input_batch), ]), nrow=numhid1 * numwords) + embedding_layer_state <- myReshape(weights$word_embedding[as.integer(input_batch),], nrows=numhid1 * numwords) + + #embedding_layer_state2 <- matlab::reshape(as.matrix(weights$word_embedding[as.numeric(input_batch), ]), numhid1*numwords, 100) + +# %% COMPUTE STATE OF HIDDEN LAYER. +# % Compute inputs to hidden units. 
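# Illustrative sketch (not from the original code; toy sizes assumed): the two
# blocks that follow compute sigmoid(t(embed_to_hid) %*% embedding_state + hid_bias)
# and then a numerically stable softmax over the output layer. In base R the
# same steps can be written with sweep() doing the broadcasting that the
# repmat-style helper `fn` performs:
toy_embed <- matrix(rnorm(6 * 4), 6, 4)    # (numhid1*numwords) x batchsize
toy_W_eh  <- matrix(rnorm(6 * 3), 6, 3)    # embed_to_hid:  6 embedding dims -> 3 hidden units
toy_W_ho  <- matrix(rnorm(3 * 5), 3, 5)    # hid_to_output: 3 hidden units  -> 5 vocabulary words
toy_b_h   <- rnorm(3)
toy_b_o   <- rnorm(5)
toy_hid    <- 1 / (1 + exp(-sweep(crossprod(toy_W_eh, toy_embed), 1, toy_b_h, "+")))
toy_logits <- sweep(crossprod(toy_W_ho, toy_hid), 1, toy_b_o, "+")
toy_logits <- sweep(toy_logits, 2, apply(toy_logits, 2, max), "-")  # subtract column maxima
toy_exp    <- exp(toy_logits)
toy_probs  <- sweep(toy_exp, 2, colSums(toy_exp), "/")
stopifnot(all(abs(colSums(toy_probs) - 1) < 1e-12))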
+ # crossprod = t(x) %*% y + inputs_to_hidden_units = myCrossProd(weights$embed_to_hid, embedding_layer_state) + fn(weights$hid_bias, 1, batchsize) + +# benchmark( +# tmp1 <- repmat(weights$hid_bias, 1, batchsize), +# tmp2 <- myRepMat4(weights$hid_bias, 1, batchsize), +# replications=10 +# ) + + +# % Apply logistic activation function. +# % FILL IN CODE. Replace the line below by one of the options. +# hidden_layer_state = zeros(numhid2, batchsize) +# % Options +# % (a) hidden_layer_state = 1 ./ (1 + exp(inputs_to_hidden_units)); +# % (b) hidden_layer_state = 1 ./ (1 - exp(-inputs_to_hidden_units)); + hidden_layer_state = 1 / (1 + exp(-inputs_to_hidden_units)) +# % (d) hidden_layer_state = -1 ./ (1 + exp(-inputs_to_hidden_units)); + +# %% COMPUTE STATE OF OUTPUT LAYER. +# % Compute inputs to softmax. +# % FILL IN CODE. Replace the line below by one of the options. +# inputs_to_softmax = zeros(vocab_size, batchsize) +# % Options + inputs_to_softmax = myCrossProd(weights$hid_to_output, hidden_layer_state) + fn(weights$output_bias, 1, batchsize) + +# % (b) inputs_to_softmax = t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, batchsize, 1); +# % (c) inputs_to_softmax = hidden_layer_state %*% t(hid_to_output_weights) + repmat(output_bias, 1, batchsize); +# % (d) inputs_to_softmax = hid_to_output_weights %*% hidden_layer_state + repmat(output_bias, batchsize, 1); + +# % Subtract maximum. +# % Remember that adding or subtracting the same constant from each input to a +# % softmax unit does not affect the outputs. Here we are subtracting maximum to +# % make all inputs <= 0. This prevents overflows when computing their +# % exponents. + # max in matlab returns max from each column by default +# benchmark( +# tmp <- apply(inputs_to_softmax, 2, max), +# tmp2 <- colMaxs(inputs_to_softmax), +# replications=10) + tmp <- apply(inputs_to_softmax, 2, max) + inputs_to_softmax = inputs_to_softmax - fn(tmp, vocab_size, 1) + +# % Compute exp. + output_layer_state = exp(inputs_to_softmax) + +# % Normalize to get probability distribution. + output_layer_state = output_layer_state / fn(colSums(output_layer_state), vocab_size, 1) + + return(list(embedding_layer_state=embedding_layer_state, + hidden_layer_state=hidden_layer_state, + output_layer_state=output_layer_state)) +} \ No newline at end of file diff --git a/Neural Net Language Model/LoadData_Old.R b/Neural Net Language Model/LoadData_Old.R new file mode 100644 index 0000000..716e7c4 --- /dev/null +++ b/Neural Net Language Model/LoadData_Old.R @@ -0,0 +1,62 @@ +library(R.matlab) +# library(Matrix) +library(parallel); options(mc.cores = 4) + +load_data <- function(N) { + # % This method loads the training, validation and test set. + # % It also divides the training set into mini-batches. + # % Inputs: + # % N: Mini-batch size. + # % Outputs: + # % train_input: An array of size D X N X M, where + # % D: number of input dimensions (in this case, 3). + # % N: size of each mini-batch (in this case, 100). + # % M: number of minibatches. + # % train_target: An array of size 1 X N X M. + # % valid_input: An array of size D X number of points in the validation set. + # % test: An array of size D X number of points in the test set. + # % vocab: Vocabulary containing index to word mapping. 
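# Illustrative sketch (not from the original code): the minibatch split built
# below turns the D x (N*M) training matrix into a list of M matrices of size
# D x N, discarding any leftover columns. With toy sizes D = 3, N = 4, M = 2:
toy_train <- matrix(seq_len(3 * 9), nrow = 3)   # 3 x 9; the ninth column is dropped
toy_N     <- 4
toy_M     <- floor(ncol(toy_train) / toy_N)      # 2 full minibatches
toy_batches <- lapply(seq_len(toy_M),
                      function(b) toy_train[, ((b - 1) * toy_N + 1):(b * toy_N)])
stopifnot(length(toy_batches) == toy_M, all(dim(toy_batches[[1]]) == c(3, toy_N)))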
+ + data.mat <- readMat("Neural Net Language Model/data.mat") + data <- list(testData = (data.mat$data[1,1,1][[1]]), + trainData = (data.mat$data[2,1,1][[1]]), + validData = (data.mat$data[3,1,1][[1]]), + vocab = unlist(data.mat$data[4,1,1]) + ) + + numdims = nrow(data$trainData) + D = numdims - 1 # subtract 1 because 1:D is the number of input words and D is the predicted word + M = floor(ncol(data$trainData) / N) + + # shift to an list of M minibatches, each with D*N + # looks like we threw out the remainder training data + splitMatrixIntoBatch <- function(dat, b, N, byCol=TRUE) { + # N is the size of each batch + # b is the requested batch + if(length(dim(dat)) == 0) { + if(byCol) dim(dat) <- c(1, length(dat)) else dim(dat) <- c(length(dat), 1) + } + start <- ((b - 1) * N) + 1 + end <- b * N + + if(byCol) return(dat[,start:end]) else return(dat[start:end,]) + } + train_input <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[1:D,], byCol=TRUE) + train_target <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[D+1,], byCol=TRUE) + + valid_input <- (data$validData[1:D,]) + valid_target <- data$validData[D + 1,] + + test_input <- (data$testData[1:D,]) + test_target <- data$testData[D + 1,] + + vocab <- data$vocab + + return(list(train_input=train_input, + train_target=train_target, + valid_input=valid_input, + valid_target=valid_target, + test_input=test_input, + test_target=test_target, + vocab=vocab)) +} \ No newline at end of file diff --git a/Neural Net Language Model/MatlabFunctions.R b/Neural Net Language Model/MatlabFunctions.R new file mode 100644 index 0000000..11f3e0c --- /dev/null +++ b/Neural Net Language Model/MatlabFunctions.R @@ -0,0 +1,12 @@ +# Necessary matlab functions + +# Matlab helper functions +# Functions to replicate various matlab commands + +# most are found in the matlab library +library(matlab) + +# HELPER FUNCTIONS +randn <- function(x, y) matrix(data=rnorm(x*y), nrow=x, ncol=y) +myPrintf <- function(txt, ...) writeLines(sprintf(txt, ...), sep="", con=stdout(), useBytes=TRUE) + diff --git a/Neural Net Language Model/MatlabFunctions_Old.R b/Neural Net Language Model/MatlabFunctions_Old.R new file mode 100644 index 0000000..a92f7ab --- /dev/null +++ b/Neural Net Language Model/MatlabFunctions_Old.R @@ -0,0 +1,332 @@ +# Matlab helper functions +# Functions to replicate various matlab commands + +# most are found in the matlab library +library(matlab) +library(Matrix) + +# HELPER FUNCTIONS +randn <- function(x, y) matrix(data=rnorm(x*y), nrow=x, ncol=y) +myPrintf <- function(txt, ...) 
writeLines(sprintf(txt, ...), sep="", con=stdout(), useBytes=TRUE) +myReshape <- function(mat, nrows, ncols) { + if(missing(ncols)) { + dim(mat) <- c(nrows, floor(length(mat) / nrows)) + } else { + dim(mat) <- c(floor(length(mat) / ncols), ncols) + } + return(mat) +} + +myRepMat <- function(A, nrows, ncols) { + # only works for numeric matrices with set number of rows & columns + stopifnot(nrows > 0 & ncols > 0) + if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- (t(myMatMult(A, create1Matrix(nrows)))) + } else if(nrows == 1) { + B <- (myMatMult(A, create1Matrix(ncols))) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + B <- (myMatMult(A, matrix(1, nrow=nrows, ncol=ncols))) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) + +} + +# tmp <- A +# benchmark( +# dim(tmp) <- c(length(A), 1), +# tmp <- Matrix(A, ncol=1, nrow=length(A)), +# replications=100 +# ) + +myRepMat2 <- function(A, nrows, ncols) { + # only works for numeric matrices with set number of rows & columns + stopifnot(nrows > 0 & ncols > 0) + if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- (t(kronecker(create1Matrix(nrows), A))) + } else if(nrows == 1) { + B <- (kronecker(create1Matrix(ncols), A)) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + B <- (kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) +} + +myRepMat3 <- function(A, nrows, ncols) { + # only works for numeric matrices with set number of rows & columns + stopifnot(nrows > 0 & ncols > 0) + if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- (t((A %*% create1Matrix(nrows)))) + } else if(nrows == 1) { + B <- ((A %*% create1Matrix(ncols))) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + B <- ((A %*% matrix(1, nrow=nrows, ncol=ncols))) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) + +} + +myRepMat4 <- function(A, nrows, ncols) { + stopifnot(nrows > 0 & ncols > 0) + + + if(missing(ncols)) ncols <- nrows + if(nrows == 1 & ncols == 1) return(A) + if(ncols == 1) { + B <- Matrix(A, nrow=nrow, ncol=length(A), byrow=FALSE) + + # B <- (t((A %*% create1Matrix(nrows)))) + } else if(nrows == 1) { + B <- Matrix(A, nrow=length(A), ncol=ncols, byrow=TRUE) + # B <- ((A %*% create1Matrix(ncols))) + } else { + #return(kronecker(matrix(1, nrow=nrows, ncol=ncols), A)) + #if(is.null(dim(A))) A <- Matrix(A, ncol=1, nrow = length(A)) + + B <- ((A %*% matrix(1, nrow=nrows, ncol=ncols))) + } + #res <- (all.equal(B, repmat(A, nrows, ncols))) + #if(res != TRUE) warning(res) + + return(B) + + + +} + +create1Matrix <- function(times) { + tmp <- rep.int(1, times=times) + dim(tmp) <- c(1, times) + return(tmp) +} + +benchmark( + tmp <- create1Matrix(1000), + tmp2 <- Matrix(1, 1, 1000), + replications=100 + + ) + + +# benchmark( +# b1 <- matrix(1, nrow=1, ncol=46568), +# b2 <- Matrix(1, nrow=1, ncol=46568), +# b3 <- fn(46568), +# replications=1000) + +# +# benchmark( +# tmp1 <- repmat(A, nrows, ncols), +# tmp2 <- myRepMat(A, nrows, ncols), +# replications = 100 +# +# ) + + +# if("gputools" %in% 
.packages()) { +# myCrossProd <- gpuCrossprod +# myTCrossProd <- gpuTcrossprod +# myMatMult <- gpuMatMult +# } else { +# myCrossProd <- crossprod +# myTCrossProd <- tcrossprod +# myMatMult <- function(x, y) x %*% y +# } + +USE_GPU <- "gputools" %in% .packages() +SPARSE_MATRIX_CLASSES <- c("dsCMatrix", "ddiMatrix", "dtCMatrix", "dtTMatrix", "dgTMatrix", "dtRMatrix", "dsTMatrix", "dgRMatrix", "dgCMatrix") + +myMatMult <- function(a, b=NULL) { + # if one is a vector, then no gpu + # if sparse, then no gpu + if(is.null(b)) b <- a + + if(USE_GPU) { + dim.a <- dim(a) + dim.b <- dim(b) + is.vector <- is.null(dim.a) | is.null(dim.b) | 1 %in% dim.a | 1 %in% dim.b + if(!is.vector) { + classes <- c(class(a), class(b)) + is.sparse.matrix <- all(classes %in% SPARSE_MATRIX_CLASSES) # sparseVector as well, but not needed here + + if(!is.sparse.matrix) return(gpuMatMult(a, b)) + } + } + + a %*% b +} + +myCrossProd <- function(a, b=NULL) { + # if one is a vector, then no gpu + # if sparse, then no gpu, unless: + # if large sparse matrix then gpu; otherwise none + if(is.null(b)) b <- a + if(USE_GPU) { + dim.a <- dim(a) + dim.b <- dim(b) + is.vector <- is.null(dim.a) | is.null(dim.b) | 1 %in% dim.a | 1 %in% dim.b + if(!is.vector) { + large.matrix <- any(dims > 1000) + + classes <- c(class(a), class(b)) + is.sparse.matrix <- all(classes %in% SPARSE_MATRIX_CLASSES) # sparseVector as well, but not needed here + + if(!is.sparse.matrix | large.matrix) return(gpuCrossprod(a, b)) + } + } + + crossprod(a, b) +} + +myTCrossProd <- function(a, b=NULL) { + # if one is a vector, then no gpu + # if sparse, then no gpu + if(is.null(b)) b <- a + if(USE_GPU) { + dim.a <- dim(a) + dim.b <- dim(b) + is.vector <- is.null(dim.a) | is.null(dim.b) | 1 %in% dim.a | 1 %in% dim.b + if(is.vector) { + classes <- c(class(a), class(b)) + is.sparse.matrix <- all(classes %in% SPARSE_MATRIX_CLASSES) # sparseVector as well, but not needed here + + if(!is.sparse.matrix) return(gpuTcrossprod(a, b)) + } + } + + tcrossprod(a, b) +} + + +# mm <- matrix(runif(3000*2000), 3000, 2000) +# mmsparse<- matrix(c(1,0,0,0,0,0,0,0), nr=3000, nc=2000) +# +# MM <- Matrix(mm) +# MMsparse<- Matrix(mmsparse) +# +# +# benchmark( +# tmp1 <- myMatMult(mmsparse, t(mmsparse)), +# tmp2 <- myMatMult(mm, t(mm)), +# tmp3 <- myMatMult(MM, t(MM)), +# tmp4 <- myMatMult(MMsparse, t(MMsparse)), +# tmp5 <- MM %*% t(MM), +# tmp6 <- gpuMatMult(MMsparse, t(MMsparse)), +# replications=10 +# ) +# +# benchmark( +# tmp1 <- myCrossProd(mmsparse), +# tmp2 <- myCrossProd(mm), +# tmp3 <- myCrossProd(MM), +# tmp4 <- myCrossProd(MMsparse), +# replications=10 +# ) +# +# benchmark( +# tmp1 <- myTCrossProd(mmsparse), +# tmp2 <- myTCrossProd(mm), +# tmp3 <- myTCrossProd(MM), +# tmp4 <- myTCrossProd(MMsparse), +# replications=10 +# ) + + +zeros <- function(...) { + # modified from matlab to incorporate sparse matrix when possible + + nargs <- length(dots <- list(...)) + dims <- as.integer(if (nargs == 1 && matlab:::is.size_t(dots[[1]])) { + dots[[1]] + } else { + unlist(dots) + }) + if (length(dims) == 1) { + dims[2] <- dims[1] + } + if (!(length(dims) > 1)) { + stop("dimensions must be of length greater than 1") + } + else if (!(all(dims > 0))) { + stop("dimensions must be a positive quantity") + } + if(length(dims) == 2) return(Matrix(0, nrow=dims[1], ncol=dims[2], sparse=TRUE)) + #if(length(dims) == 2) return(matrix(0, nrow=dims[1], ncol=dims[2])) + + return(array(0, dims)) +} + +repmat <- function(A, ...) 
+{ + nargs <- length(dots <- list(...)) + dims <- as.integer(if (nargs == 1 && matlab:::is.size_t(dots[[1]])) { + dots[[1]] + } else { + unlist(dots) + }) + if (length(dims) == 1) { + dims[2] <- dims[1] + } + if (!(length(dims) > 1)) { + stop("dimensions must be of length greater than 1") + } else if (!(all(dims > 0))) { + stop("dimensions must be a positive quantity") + } + + if(is.null(dim(A)) & class(A) == "dsparseVector") A <- Matrix(A, ncol=1, nrow = length(A)) + + B <- switch(EXPR = mode(A), logical = , complex = , numeric = , S4 = { + if (all(dims == 1)) { + A + } else if (dims[length(dims)] == 1) { + t(kronecker(array(1, rev(dims)), A)) + } else { + kronecker(array(1, dims), A) + } + }, character = { + fA <- factor(A, levels = unique(A)) + iA.mat <- Recall(unclass(fA), dims) + saved.dim <- dim(iA.mat) + cA.mat <- sapply(seq(along = iA.mat), function(i, A, + fac) { + A[i] <- levels(fac)[A[i]] + }, iA.mat, fA) + dim(cA.mat) <- saved.dim + cA.mat + }, NULL) + if (is.null(B)) { + stop(sprintf("argument %s must be one of [%s]", sQuote("A"), + paste(c("numeric", "logical", "complex", "character"), + collapse = "|"))) + } + return(B) +} + +setMethod(size, signature(X="Matrix", dimen="ANY"), function(X, dimen) if(missing(dimen)) size(as.matrix(X)) else size(as.matrix(X), dimen)) + + + + \ No newline at end of file diff --git a/Neural Net Language Model/Train_Old.R b/Neural Net Language Model/Train_Old.R new file mode 100644 index 0000000..1fb2d29 --- /dev/null +++ b/Neural Net Language Model/Train_Old.R @@ -0,0 +1,235 @@ +# setwd("~/R_Projects/Neural Networks for Machine Learning") +library(gputools) +library(Matrix) +library(parallel); options(mc.cores=4) +library(R.utils) + +source("MatlabFunctions.R") +source("Neural Net Language Model/LoadData.R") +source("Neural Net Language Model/FProp.R") + + +# see also http://mathesaurus.sourceforge.net/octave-r.html + +# % This function trains a neural network language model. +train <- function(epochs) { +# % Inputs: +# % epochs: Number of epochs to run. +# % Output: +# % model: A struct containing the learned weights and biases and vocabulary. + + start_time <- proc.time() + +# % SET HYPERPARAMETERS HERE. + batchsize = 1000 # Mini-batch size; default = 100. + learning_rate = 0.1 # Learning rate; default = 0.1. + momentum = 0.9 # Momentum; default = 0.9. + numhid1 = 50 # Dimensionality of embedding space; default = 50. + numhid2 = 200 # Number of units in hidden layer; default = 200. + init_wt = 0.01 # Standard deviation of the normal distribution, which is sampled to get the initial weights; default = 0.01 + +# % VARIABLES FOR TRACKING TRAINING PROGRESS. + # number is the number of batches to run before showing training & validation + show_training_CE_after = 100 + show_validation_CE_after = 100 + +# % LOAD DATA. + data <- load_data(batchsize) + tmp <- data$train_input[[1]] + numwords <- nrow(tmp) + batchsize <- ncol(tmp) + numbatches <- length(data$train_input) + vocab_size <- length(data$vocab) + +# % INITIALIZE WEIGHTS AND BIASES. 
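# Illustrative sketch (not from the original code): a spelled-out equivalent of
# the mapply-based initialisation below, under the same conventions -- weight
# matrices drawn from N(0, init_wt^2), biases started at zero (dense here,
# sparse vectors in the code that follows). The shapes match the d1/d2 lists below.
init_weights_plain <- function(vocab_size, numwords, numhid1, numhid2, init_wt) {
  list(word_embedding = init_wt * matrix(rnorm(vocab_size * numhid1), vocab_size, numhid1),
       embed_to_hid   = init_wt * matrix(rnorm(numwords * numhid1 * numhid2), numwords * numhid1, numhid2),
       hid_to_output  = init_wt * matrix(rnorm(numhid2 * vocab_size), numhid2, vocab_size),
       hid_bias       = numeric(numhid2),
       output_bias    = numeric(vocab_size))
}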
+ d1 <- list(word_embedding=vocab_size, + embed_to_hid=numwords * numhid1, + hid_to_output=numhid2, + hid_bias=numhid2, + output_bias=vocab_size) + d2 <- list(word_embedding=numhid1, + embed_to_hid=numhid2, + hid_to_output=vocab_size, + hid_bias=1, + output_bias=1) + + + weights <- c(mapply(function(x, y, init_wt) init_wt*randn(x,y), x=d1[1:3], y=d2[1:3], MoreArgs=list(init_wt=init_wt)), + mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0), i=0)) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) + + deltas <- c(mapply(zeros, d1[1:3], d2[1:3]), + mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0, i=0))) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) + + gradients <- list(word_embedding=zeros(d1[1], d2[1]), + embed_to_hid=NULL, + hid_to_output=NULL, + hid_bias=NULL, + output_bias=NULL) + + expansion_matrix = .symDiagonal(vocab_size) # expanding the matrix is faster on a Diagonal class; .symDiagonal faster than just Diagonal here (b/c we are indexing it) + #expansion_matrix = eye(vocab_size) + count = 0 + tiny = exp(-30) + + datasetsize = ncol(data$valid_input) + #datasetsize = + expanded_valid_target = expansion_matrix[, as.integer(data$valid_target)] + + + + +# % TRAIN. + for(epoch in 1:epochs) { + myPrintf('Epoch %d\n', epoch) + this_chunk_CE <- 0 + trainset_CE <- 0 + +# % LOOP OVER MINI-BATCHES. + for(m in 1:numbatches) { + + input_batch <- data$train_input[[m]] + target_batch <- data$train_target[[m]] +# % FORWARD PROPAGATE. +# % Compute the state of each layer in the network given the input batch and all weights and biases + # returns the embedding, hidden and output layer states + neural_net_states <- fprop(input_batch, weights, repmat) + +# benchmark( +# neural_net_states <- fprop(input_batch, weights, repmat), +# neural_net_states2 <- fprop(input_batch, weights, myRepMat), +# neural_net_states3 <- fprop(input_batch, weights, myRepMat2), +# neural_net_states3 <- fprop(input_batch, weights, myRepMat3), +# neural_net_states4 <- fprop(input_batch, weights, myRepMat4), +# replications=10 +# ) + +# % COMPUTE DERIVATIVE. +# %% Expand the target to a sparse 1-of-K vector. + expanded_target_batch = expansion_matrix[, as.integer(target_batch)] +# %% Compute derivative of cross-entropy loss function. + error_deriv = neural_net_states$output_layer_state - expanded_target_batch + +# % MEASURE LOSS FUNCTION. + CE = -sum(colSums(expanded_target_batch * log(neural_net_states$output_layer_state + tiny))) / batchsize + count = count + 1 + this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count + trainset_CE = trainset_CE + (CE - trainset_CE) / m + #printf('\rBatch %d Train CE %.3f', m, this_chunk_CE) + myPrintf('\rBatch %d Train CE %.3f', m, this_chunk_CE) + if (mod(m, show_training_CE_after) == 0) { + myPrintf('\n') + count = 0 + this_chunk_CE = 0 + } + +# % BACK PROPAGATE. +# %% OUTPUT LAYER. + # tcrossprod == x %*% t(y) + gradients$hid_to_output = myTCrossProd(neural_net_states$hidden_layer_state, error_deriv) + gradients$output_bias = rowSums(error_deriv) + back_propagated_deriv_1 = myMatMult(weights$hid_to_output,error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) + +# %% HIDDEN LAYER. +# % FILL IN CODE. Replace the line below by one of the options. 
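# Illustrative sketch (not from the original code): the factor
# hidden_layer_state * (1 - hidden_layer_state) used just above is the
# derivative of the logistic activation, since s(x) = 1 / (1 + exp(-x)) has
# s'(x) = s(x) * (1 - s(x)). A quick finite-difference check:
sigmoid <- function(x) 1 / (1 + exp(-x))
x0  <- 0.3
eps <- 1e-6
numeric_deriv  <- (sigmoid(x0 + eps) - sigmoid(x0 - eps)) / (2 * eps)
analytic_deriv <- sigmoid(x0) * (1 - sigmoid(x0))
stopifnot(abs(numeric_deriv - analytic_deriv) < 1e-8)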
+# gradients$embed_to_hid = zeros(numhid1 * numwords, numhid2); +# % Options: +# % (a) embed_to_hid_weights_gradient = t(back_propagated_deriv_1) %*% embedding_layer_state; + gradients$embed_to_hid = myTCrossProd(neural_net_states$embedding_layer_state, back_propagated_deriv_1); +# % (c) embed_to_hid_weights_gradient = back_propagated_deriv_1; +# % (d) embed_to_hid_weights_gradient = embedding_layer_state; + +# % FILL IN CODE. Replace the line below by one of the options. +# gradients$hid_bias = zeros(numhid2, 1); +# % Options + gradients$hid_bias = rowSums(back_propagated_deriv_1); +# % (b) hid_bias_gradient = apply(back_propagated_deriv_1, 2, sum); +# % (c) hid_bias_gradient = back_propagated_deriv_1; +# % (d) hid_bias_gradient = back_propagated_deriv_1'; + +# % FILL IN CODE. Replace the line below by one of the options. +# back_propagated_deriv_2 = zeros(numhid2, batchsize); +# % Options + back_propagated_deriv_2 = myMatMult(weights$embed_to_hid, back_propagated_deriv_1); +# % (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights; +# % (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights; +# % (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights'; + + #gradients$word_embedding[,] = 0; +# %% EMBEDDING LAYER. + + gradients$word_embedding[,] <- 0 + for(w in 1:numwords) { + tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] + gradients$word_embedding= gradients$word_embedding + myTCrossProd(expansion_matrix[, as.integer(input_batch[w, ])], tmp) + } + +# system.time({ +# embeddingFn <- function(w, back_propogated_deriv_2, numhid1, input_batch, expansion_matrix) { +# tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] +# myTCrossProd(expansion_matrix[, input_batch[w, ]], tmp) +# } +# w <- mclapply(1:numwords, embeddingFn, back_propogated_deriv_2=back_propogated_deriv_2, numhid1=numhid1, input_batch=input_batch, expansion_matrix=expansion_matrix) +# for(exMat in w) gradients$word_embedding <- gradients$word_embedding + exMat +# }) + +# % UPDATE WEIGHTS AND BIASES. + + #deltas <- mapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize)) + deltas <- mcmapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize), mc.cores=5) + + #weights <- mapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate)) + weights <- mcmapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate), mc.cores=5) + +# % VALIDATE. + if(mod(m, show_validation_CE_after) == 0) { + myPrintf('\rRunning validation ...') + neural_net_states <- fprop(data$valid_input, weights, fn=repmat) +# benchmark( +# neural_net_states <- fprop(data$valid_input, weights, fn=repmat), +# neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), +# neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), +# neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), +# neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), +# replications=2 +# ) + + + CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize + myPrintf(' Validation CE %.3f\n', CE) + } + } + myPrintf('\rAverage Training CE %.3f\n', trainset_CE) + } + myPrintf('Finished Training.\n') + myPrintf('Final Training CE %.3f\n', trainset_CE) + +# % EVALUATE ON VALIDATION SET. 
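# Illustrative sketch (not from the original code): the evaluation below (and
# the per-batch CE above) is the mean negative log-probability assigned to the
# target words. With a 1-of-K expanded target the elementwise product keeps
# exactly one log-probability per column, so the two forms agree:
toy_probs    <- matrix(c(0.7, 0.2, 0.1,  0.1, 0.8, 0.1), nrow = 3)  # vocab 3 x batch 2
toy_targets  <- c(1, 2)                                             # target word per case
toy_expanded <- diag(3)[, toy_targets]
toy_tiny     <- exp(-30)
ce_expanded  <- -sum(colSums(toy_expanded * log(toy_probs + toy_tiny))) / length(toy_targets)
ce_indexed   <- -mean(log(toy_probs[cbind(toy_targets, seq_along(toy_targets))] + toy_tiny))
stopifnot(abs(ce_expanded - ce_indexed) < 1e-12)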
+  myPrintf('\rRunning validation ...')
+  
+  neural_net_states <- fprop(data$valid_input, weights, fn=repmat)
+  CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize
+  myPrintf('\rFinal Validation CE %.3f\n', CE)
+  
+# % EVALUATE ON TEST SET.
+  myPrintf('\rRunning test ...');
+  
+  neural_net_states <- fprop(data$test_input, weights, fn=repmat)
+  
+  datasetsize = size(data$test_input, 2);  # normalise by the test set size, not the validation set
+  expanded_test_target = expansion_matrix[, as.integer(data$test_target)];
+  CE = -sum(colSums(expanded_test_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize
+  myPrintf('\rFinal Test CE %.3f\n', CE)
+  
+  end_time <- proc.time()
+  
+  print(end_time - start_time)
+  return(list(weights=weights, vocab=data$vocab))
+}
+
+delta_update_fn <- function(delta, gradient, momentum, batchsize) { momentum * delta + gradient / batchsize }
+weights_update_fn <- function(delta, weight, learning_rate) { weight - learning_rate * delta }
+
+# Rprof()
+# model <- train(1)
+# Rprof(NULL)
+# summaryRprof()
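# Illustrative sketch (not from the original code): one momentum update of the
# kind applied by delta_update_fn / weights_update_fn above. The delta keeps a
# momentum-weighted running average of the batch-averaged gradient, and the
# weight then moves against it by the learning rate:
toy_weight   <- matrix(0.5, 2, 2)
toy_delta    <- matrix(0,   2, 2)
toy_gradient <- matrix(1,   2, 2)   # gradient summed over a toy minibatch
toy_momentum <- 0.9; toy_lr <- 0.1; toy_batchsize <- 100
toy_delta  <- toy_momentum * toy_delta + toy_gradient / toy_batchsize
toy_weight <- toy_weight - toy_lr * toy_delta
stopifnot(all(abs(toy_weight - 0.499) < 1e-12))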