diff --git a/Neural Net Language Model/FProp.R b/Neural Net Language Model/FProp.R index 31d145e..7186a24 100644 --- a/Neural Net Language Model/FProp.R +++ b/Neural Net Language Model/FProp.R @@ -1,110 +1,123 @@ +# Original FProp file, with minor modifications to accomodate R + # library(Matrix) #library(matrixStats) +library(R.matlab) +library(matrixStats) -fprop <- function(input_batch, weights, fn) { -# % This method forward propagates through a neural network. -# % Inputs: -# % input_batch: The input data as a matrix of size numwords X batchsize where, -# % numwords is the number of words, batchsize is the number of data points. -# % So, if input_batch(i, j) = k then the ith word in data point j is word -# % index k of the vocabulary. -# % -# % word_embedding_weights: Word embedding as a matrix of size -# % vocab_size X numhid1, where vocab_size is the size of the vocabulary -# % numhid1 is the dimensionality of the embedding space. -# % -# % embed_to_hid_weights: Weights between the word embedding layer and hidden -# % layer as a matrix of size numhid1*numwords X numhid2, numhid2 is the -# % number of hidden units. -# % -# % hid_to_output_weights: Weights between the hidden layer and output softmax -# % unit as a matrix of size numhid2 X vocab_size -# % -# % hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1. -# % -# % output_bias: Bias of the output layer as a matrix of size vocab_size X 1. -# % -# % Outputs: -# % embedding_layer_state: State of units in the embedding layer as a matrix of -# % size numhid1*numwords X batchsize -# % -# % hidden_layer_state: State of units in the hidden layer as a matrix of size -# % numhid2 X batchsize -# % -# % output_layer_state: State of units in the output layer as a matrix of size -# % vocab_size X batchsize -# % - - tmp <- dim(input_batch) # basically dim +fprop <- function(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) { + # % This method forward propagates through a neural network. + # % Inputs: + # % input_batch: The input data as a matrix of size numwords X batchsize where, + # % numwords is the number of words, batchsize is the number of data points. + # % So, if input_batch(i, j) = k then the ith word in data point j is word + # % index k of the vocabulary. + # % + # % word_embedding_weights: Word embedding as a matrix of size + # % vocab_size X numhid1, where vocab_size is the size of the vocabulary + # % numhid1 is the dimensionality of the embedding space. + # % + # % embed_to_hid_weights: Weights between the word embedding layer and hidden + # % layer as a matrix of size numhid1*numwords X numhid2, numhid2 is the + # % number of hidden units. + # % + # % hid_to_output_weights: Weights between the hidden layer and output softmax + # % unit as a matrix of size numhid2 X vocab_size + # % + # % hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1. + # % + # % output_bias: Bias of the output layer as a matrix of size vocab_size X 1. 
+ # % + # % Outputs: + # % embedding_layer_state: State of units in the embedding layer as a matrix of + # % size numhid1*numwords X batchsize + # % + # % hidden_layer_state: State of units in the hidden layer as a matrix of size + # % numhid2 X batchsize + # % + # % output_layer_state: State of units in the output layer as a matrix of size + # % vocab_size X batchsize + # % + + tmp <- size(input_batch) # basically dim numwords <- tmp[1] batchsize <- tmp[2] - tmp <- dim(weights$word_embedding) # basically dim + tmp <- size(word_embedding_weights) # basically dim vocab_size <- tmp[1] numhid1 <- tmp[2] + numhid2 <- size(embed_to_hid_weights, 2) - numhid2 <- ncol(weights$embed_to_hid) - -# %% COMPUTE STATE OF WORD EMBEDDING LAYER. -# % Look up the inputs word indices in the word_embedding_weights matrix. + # %% COMPUTE STATE OF WORD EMBEDDING LAYER. + # % Look up the inputs word indices in the word_embedding_weights matrix. # each row of the word weights corresponds to a word (250 total) # input_batch contains 300 total words (3 * 100 batchsize) # each element of input_batch is a number between 1 and 249 (250?), corresponding to a word #embedding_layer_state2 <- matrix(as.numeric(weights$word_embedding[as.integer(input_batch), ]), nrow=numhid1 * numwords) - embedding_layer_state <- myReshape(weights$word_embedding[as.integer(input_batch),], nrows=numhid1 * numwords) + #embedding_layer_state <- myReshape(weights$word_embedding[as.integer(input_batch),], nrows=numhid1 * numwords) + #embedding_layer_state <- reshape(t(word_embedding_weights[reshape(input_batch, 1, []), ]), numhid1 * numwords, []) + + # [] is allowed in reshape: one dimension remains unspecified and Octave will determine it automatically + tmp <- t(word_embedding_weights[reshape(input_batch, 1, length(input_batch)), ]) + embedding_layer_state <- reshape(tmp, numhid1 * numwords, length(tmp) / (numhid1 * numwords)) #embedding_layer_state2 <- matlab::reshape(as.matrix(weights$word_embedding[as.numeric(input_batch), ]), numhid1*numwords, 100) - -# %% COMPUTE STATE OF HIDDEN LAYER. -# % Compute inputs to hidden units. + + # %% COMPUTE STATE OF HIDDEN LAYER. + # % Compute inputs to hidden units. # crossprod = t(x) %*% y - inputs_to_hidden_units = myCrossProd(weights$embed_to_hid, embedding_layer_state) + fn(weights$hid_bias, 1, batchsize) - -# benchmark( -# tmp1 <- repmat(weights$hid_bias, 1, batchsize), -# tmp2 <- myRepMat4(weights$hid_bias, 1, batchsize), -# replications=10 -# ) - - -# % Apply logistic activation function. -# % FILL IN CODE. Replace the line below by one of the options. -# hidden_layer_state = zeros(numhid2, batchsize) -# % Options -# % (a) hidden_layer_state = 1 ./ (1 + exp(inputs_to_hidden_units)); -# % (b) hidden_layer_state = 1 ./ (1 - exp(-inputs_to_hidden_units)); - hidden_layer_state = 1 / (1 + exp(-inputs_to_hidden_units)) -# % (d) hidden_layer_state = -1 ./ (1 + exp(-inputs_to_hidden_units)); - -# %% COMPUTE STATE OF OUTPUT LAYER. -# % Compute inputs to softmax. -# % FILL IN CODE. Replace the line below by one of the options. 
-# inputs_to_softmax = zeros(vocab_size, batchsize) -# % Options - inputs_to_softmax = myCrossProd(weights$hid_to_output, hidden_layer_state) + fn(weights$output_bias, 1, batchsize) - -# % (b) inputs_to_softmax = t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, batchsize, 1); -# % (c) inputs_to_softmax = hidden_layer_state %*% t(hid_to_output_weights) + repmat(output_bias, 1, batchsize); -# % (d) inputs_to_softmax = hid_to_output_weights %*% hidden_layer_state + repmat(output_bias, batchsize, 1); - -# % Subtract maximum. -# % Remember that adding or subtracting the same constant from each input to a -# % softmax unit does not affect the outputs. Here we are subtracting maximum to -# % make all inputs <= 0. This prevents overflows when computing their -# % exponents. + #inputs_to_hidden_units = myCrossProd(weights$embed_to_hid, embedding_layer_state) + fn(weights$hid_bias, 1, batchsize) + inputs_to_hidden_units <- t(embed_to_hid_weights) %*% embedding_layer_state + repmat(hid_bias, 1, batchsize) + + + # benchmark( + # tmp1 <- repmat(weights$hid_bias, 1, batchsize), + # tmp2 <- myRepMat4(weights$hid_bias, 1, batchsize), + # replications=10 + # ) + + + # % Apply logistic activation function. + # % FILL IN CODE. Replace the line below by one of the options. + # hidden_layer_state = zeros(numhid2, batchsize) + # % Options + # % (a) hidden_layer_state = 1 ./ (1 + exp(inputs_to_hidden_units)); + # % (b) hidden_layer_state = 1 ./ (1 - exp(-inputs_to_hidden_units)); + hidden_layer_state = 1 / (1 + exp(-inputs_to_hidden_units)) + # % (d) hidden_layer_state = -1 ./ (1 + exp(-inputs_to_hidden_units)); + + # %% COMPUTE STATE OF OUTPUT LAYER. + # % Compute inputs to softmax. + # % FILL IN CODE. Replace the line below by one of the options. + # inputs_to_softmax = zeros(vocab_size, batchsize) + # % Options + #inputs_to_softmax = myCrossProd(weights$hid_to_output, hidden_layer_state) + fn(weights$output_bias, 1, batchsize) + inputs_to_softmax <- t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, 1, batchsize) + + # % (b) inputs_to_softmax = t(hid_to_output_weights) %*% hidden_layer_state + repmat(output_bias, batchsize, 1); + # % (c) inputs_to_softmax = hidden_layer_state %*% t(hid_to_output_weights) + repmat(output_bias, 1, batchsize); + # % (d) inputs_to_softmax = hid_to_output_weights %*% hidden_layer_state + repmat(output_bias, batchsize, 1); + + # % Subtract maximum. + # % Remember that adding or subtracting the same constant from each input to a + # % softmax unit does not affect the outputs. Here we are subtracting maximum to + # % make all inputs <= 0. This prevents overflows when computing their + # % exponents. # max in matlab returns max from each column by default -# benchmark( -# tmp <- apply(inputs_to_softmax, 2, max), -# tmp2 <- colMaxs(inputs_to_softmax), -# replications=10) - tmp <- apply(inputs_to_softmax, 2, max) - inputs_to_softmax = inputs_to_softmax - fn(tmp, vocab_size, 1) - -# % Compute exp. + # benchmark( + # tmp <- apply(inputs_to_softmax, 2, max), + # tmp2 <- colMaxs(inputs_to_softmax), + # replications=10) + #tmp <- apply(inputs_to_softmax, 2, max) + #inputs_to_softmax = inputs_to_softmax - fn(tmp, vocab_size, 1) + inputs_to_softmax <- inputs_to_softmax - repmat(colMaxs(inputs_to_softmax), vocab_size, 1) + + # % Compute exp. output_layer_state = exp(inputs_to_softmax) -# % Normalize to get probability distribution. 
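The max subtraction, exponentiation, and normalization around this point form the usual numerically stable column-wise softmax. A minimal standalone sketch in base R, assuming x is a vocab_size x batchsize matrix of softmax inputs and using sweep() in place of repmat():

    softmax_columns <- function(x) {
      x <- sweep(x, 2, apply(x, 2, max), "-")  # shift each column so its maximum is 0; exp() cannot overflow
      e <- exp(x)
      sweep(e, 2, colSums(e), "/")             # divide each column by its sum to get a probability vector
    }

Subtracting a per-column constant leaves the result unchanged because the softmax of a column is invariant to adding the same constant to every input in that column.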
- output_layer_state = output_layer_state / fn(colSums(output_layer_state), vocab_size, 1) + # % Normalize to get probability distribution. + #output_layer_state = output_layer_state / fn(colSums(output_layer_state), vocab_size, 1) + output_layer_state <- output_layer_state / repmat(matlab::sum(output_layer_state), vocab_size, 1) return(list(embedding_layer_state=embedding_layer_state, hidden_layer_state=hidden_layer_state, diff --git a/Neural Net Language Model/FProp_Original.R b/Neural Net Language Model/FProp_Original.R index 7186a24..794c8d5 100644 --- a/Neural Net Language Model/FProp_Original.R +++ b/Neural Net Language Model/FProp_Original.R @@ -5,7 +5,7 @@ library(R.matlab) library(matrixStats) -fprop <- function(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) { +fprop_original <- function(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) { # % This method forward propagates through a neural network. # % Inputs: # % input_batch: The input data as a matrix of size numwords X batchsize where, # % numwords is the number of words, batchsize is the number of data points. # % So, if input_batch(i, j) = k then the ith word in data point j is word # % index k of the vocabulary. diff --git a/Neural Net Language Model/LoadData.R b/Neural Net Language Model/LoadData.R index 716e7c4..52482de 100644 --- a/Neural Net Language Model/LoadData.R +++ b/Neural Net Language Model/LoadData.R @@ -1,6 +1,9 @@ +# Load Data Original, with minor changes to accommodate R + library(R.matlab) # library(Matrix) library(parallel); options(mc.cores = 4) +library(itertools) load_data <- function(N) { # % This method loads the training, validation and test set. # % It also divides the training set into mini-batches. # % Inputs: @@ -24,32 +27,18 @@ load_data <- function(N) { vocab = unlist(data.mat$data[4,1,1]) ) - numdims = nrow(data$trainData) + numdims = size(data$trainData, 1) D = numdims - 1 # subtract 1 because 1:D is the number of input words and D is the predicted word - M = floor(ncol(data$trainData) / N) + M = floor(size(data$trainData, 2) / N) # shift to a list of M minibatches, each with D*N # looks like we threw out the remainder training data - splitMatrixIntoBatch <- function(dat, b, N, byCol=TRUE) { - # N is the size of each batch - # b is the requested batch - if(length(dim(dat)) == 0) { - if(byCol) dim(dat) <- c(1, length(dat)) else dim(dat) <- c(length(dat), 1) - } - start <- ((b - 1) * N) + 1 - end <- b * N - - if(byCol) return(dat[,start:end]) else return(dat[start:end,]) - } - train_input <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[1:D,], byCol=TRUE) - train_target <- mclapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[D+1,], byCol=TRUE) - - valid_input <- (data$validData[1:D,]) - valid_target <- data$validData[D + 1,] - - test_input <- (data$testData[1:D,]) - test_target <- data$testData[D + 1,] - + train_input <- data$trainData[1:D, 1:(N*M), drop=F] + train_target <- data$trainData[D + 1, 1:(N*M), drop=F] + valid_input <- data$validData[1:D,, drop=F] + valid_target <- data$validData[D + 1, , drop=F] + test_input <- data$testData[1:D, , drop=F] + test_target <- data$testData[D + 1, , drop=F] vocab <- data$vocab return(list(train_input=train_input, @@ -59,4 +48,70 @@ load_data <- function(N) { test_input=test_input, test_target=test_target, vocab=vocab)) -} \ No newline at end of file +} + +# faster to access lists than indices in an array +# but slower to create a set of lists than to reshape the training data into an array + + + +# train_input3 <- vector(M, "list") +# fn <- function() { +# it <- isplitCols(data$trainData[1:D, 1:(N*M)], chunks = M) +# replicate(M, nextElem(it), simplify=F) +# } +# +# 
fn2 <- function() { +# enumerate(isplitCols(data$trainData[1:D, 1:(N*M)], chunks = M)) +# } +# +# myReshape <- function(A, ...) { +# if (!is.array(A)) { +# stop(sprintf("argument %s must be matrix or array", sQuote("A"))) +# } +# nargs <- length(dots <- list(...)) +# dims <- as.integer(if (nargs == 1 && matlab:::is.size_t(dots[[1]])) { +# dots[[1]] +# } else { +# unlist(dots) +# }) +# if (!(length(dims) > 1)) { +# stop("dimensions must be of length greater than 1") +# } +# else if (!(all(dims > 0))) { +# stop("dimensions must be a positive quantity") +# } +# else if (prod(dims) != prod(dim(A))) { +# stop("number of elements must not change") +# } +# dim(A) <- dims +# +# return(A) +# +# } +# +# +# +# +# benchmark( +# train_input1 <- reshape(data$trainData[1:D, 1:(N*M), drop=F], D, N, M), +# train_input2 <- lapply(1:M, splitMatrixIntoBatch, N=N, dat=data$trainData[1:D,], byCol=TRUE), +# train_input3 <- fn(), +# train_input4 <- myReshape(data$trainData[1:D, 1:(N*M), drop=F], D, N, M), # close second +# train_input5 <- fn2(), # fastest +# replications=10 +# ) +# # +# +# benchmark( +# data1 <- load_data_original(batchsize), +# data2 <- load_data(batchsize), +# +# replications <- 10 +# ) +# +# benchmark( +# for(m in 1:372) tmp1 <- data1$train_input[,,m], +# for(m in 1:372) tmp2 <- data2$train_input[[m]], +# replications <- 2 +# ) diff --git a/Neural Net Language Model/LoadData_Original.R b/Neural Net Language Model/LoadData_Original.R index f732903..4eee335 100644 --- a/Neural Net Language Model/LoadData_Original.R +++ b/Neural Net Language Model/LoadData_Original.R @@ -4,7 +4,7 @@ library(R.matlab) # library(Matrix) #library(parallel); options(mc.cores = 4) -load_data <- function(N) { +load_data_original <- function(N) { # % This method loads the training, validation and test set. # % It also divides the training set into mini-batches. # % Inputs: diff --git a/Neural Net Language Model/Train.R b/Neural Net Language Model/Train.R index 1fb2d29..7d3f52a 100644 --- a/Neural Net Language Model/Train.R +++ b/Neural Net Language Model/Train.R @@ -1,10 +1,12 @@ +# Original training file, with minor modifications to run in R + # setwd("~/R_Projects/Neural Networks for Machine Learning") -library(gputools) -library(Matrix) -library(parallel); options(mc.cores=4) -library(R.utils) +#library(gputools) +#library(Matrix) +#library(parallel); options(mc.cores=4) +# library(R.utils) -source("MatlabFunctions.R") +source("Neural Net Language Model/MatlabFunctions.R") source("Neural Net Language Model/LoadData.R") source("Neural Net Language Model/FProp.R") @@ -13,14 +15,14 @@ source("Neural Net Language Model/FProp.R") # % This function trains a neural network language model. train <- function(epochs) { -# % Inputs: -# % epochs: Number of epochs to run. -# % Output: -# % model: A struct containing the learned weights and biases and vocabulary. + # % Inputs: + # % epochs: Number of epochs to run. + # % Output: + # % model: A struct containing the learned weights and biases and vocabulary. start_time <- proc.time() -# % SET HYPERPARAMETERS HERE. + # % SET HYPERPARAMETERS HERE. batchsize = 1000 # Mini-batch size; default = 100. learning_rate = 0.1 # Learning rate; default = 0.1. momentum = 0.9 # Momentum; default = 0.9. @@ -28,89 +30,76 @@ train <- function(epochs) { numhid2 = 200 # Number of units in hidden layer; default = 200. init_wt = 0.01 # Standard deviation of the normal distribution, which is sampled to get the initial weights; default = 0.01 -# % VARIABLES FOR TRACKING TRAINING PROGRESS. 
+ # % VARIABLES FOR TRACKING TRAINING PROGRESS. # number is the number of batches to run before showing training & validation show_training_CE_after = 100 - show_validation_CE_after = 100 + show_validation_CE_after = 1000 -# % LOAD DATA. + # % LOAD DATA. data <- load_data(batchsize) - tmp <- data$train_input[[1]] - numwords <- nrow(tmp) - batchsize <- ncol(tmp) - numbatches <- length(data$train_input) - vocab_size <- length(data$vocab) - -# % INITIALIZE WEIGHTS AND BIASES. - d1 <- list(word_embedding=vocab_size, - embed_to_hid=numwords * numhid1, - hid_to_output=numhid2, - hid_bias=numhid2, - output_bias=vocab_size) - d2 <- list(word_embedding=numhid1, - embed_to_hid=numhid2, - hid_to_output=vocab_size, - hid_bias=1, - output_bias=1) + tmp <- size(data$train_input) + numwords <- tmp[1] + #batchsize <- tmp[2] + #numbatches <- tmp[3] + vocab_size <- size(data$vocab, 2) - - weights <- c(mapply(function(x, y, init_wt) init_wt*randn(x,y), x=d1[1:3], y=d2[1:3], MoreArgs=list(init_wt=init_wt)), - mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0), i=0)) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) - - deltas <- c(mapply(zeros, d1[1:3], d2[1:3]), - mapply(sparseVector, length=d1[4:5], MoreArgs=list(x=0, i=0))) # mapply(rep.int, times=d1[4:5], MoreArgs=list(x=0)) + word_embedding_weights = init_wt * randn(vocab_size, numhid1); + embed_to_hid_weights = init_wt * randn(numwords * numhid1, numhid2); + hid_to_output_weights = init_wt * randn(numhid2, vocab_size); + hid_bias = zeros(numhid2, 1); + output_bias = zeros(vocab_size, 1); - gradients <- list(word_embedding=zeros(d1[1], d2[1]), - embed_to_hid=NULL, - hid_to_output=NULL, - hid_bias=NULL, - output_bias=NULL) - - expansion_matrix = .symDiagonal(vocab_size) # expanding the matrix is faster on a Diagonal class; .symDiagonal faster than just Diagonal here (b/c we are indexing it) - #expansion_matrix = eye(vocab_size) + word_embedding_weights_delta = zeros(vocab_size, numhid1); + word_embedding_weights_gradient = zeros(vocab_size, numhid1); + embed_to_hid_weights_delta = zeros(numwords * numhid1, numhid2); + hid_to_output_weights_delta = zeros(numhid2, vocab_size); + hid_bias_delta = zeros(numhid2, 1); + output_bias_delta = zeros(vocab_size, 1); + expansion_matrix = eye(vocab_size); count = 0 tiny = exp(-30) - - datasetsize = ncol(data$valid_input) - #datasetsize = - expanded_valid_target = expansion_matrix[, as.integer(data$valid_target)] - - - - -# % TRAIN. + + # % TRAIN. for(epoch in 1:epochs) { myPrintf('Epoch %d\n', epoch) this_chunk_CE <- 0 trainset_CE <- 0 -# % LOOP OVER MINI-BATCHES. - for(m in 1:numbatches) { - - input_batch <- data$train_input[[m]] - target_batch <- data$train_target[[m]] -# % FORWARD PROPAGATE. -# % Compute the state of each layer in the network given the input batch and all weights and biases + inputIT <- ihasNext(isplitCols(data$train_input, chunkSize=batchsize)) + targetIT <- ihasNext(isplitCols(data$train_target, chunkSize=batchsize)) + m <- 0 + # % LOOP OVER MINI-BATCHES. + #for(m in 1:numbatches) { + while(hasNext(inputIT) & hasNext(targetIT)) { + m <- m + 1 + + #input_batch <- data$train_input[,,m] + #target_batch <- data$train_target[,,m] + input_batch <- nextElem(inputIT) + target_batch <- nextElem(targetIT) + + # % FORWARD PROPAGATE. 
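The rewritten loop above walks the training matrix with column-splitting iterators instead of indexing a pre-built list of minibatches. A small self-contained sketch of the same isplitCols/ihasNext pattern on toy data (the matrices and chunk size below are made up purely for illustration):

    library(itertools)  # also attaches iterators, which supplies nextElem()
    toy_input  <- matrix(seq_len(30), nrow = 3)    # 3 input words x 10 training cases
    toy_target <- matrix(seq_len(10), nrow = 1)    # 1 target word per case
    inputIT  <- ihasNext(isplitCols(toy_input,  chunkSize = 4))
    targetIT <- ihasNext(isplitCols(toy_target, chunkSize = 4))
    while (hasNext(inputIT) & hasNext(targetIT)) {
      input_batch  <- nextElem(inputIT)            # 3 x 4 matrix; the final chunk is 3 x 2
      target_batch <- nextElem(targetIT)
      cat("minibatch with", ncol(input_batch), "cases\n")
    }

Splitting by chunkSize = batchsize, as the training loop does, removes the need for a precomputed numbatches count; the loop simply ends when hasNext() returns FALSE.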
+ # % Compute the state of each layer in the network given the input batch and all weights and biases # returns the embedding, hidden and output layer states - neural_net_states <- fprop(input_batch, weights, repmat) - -# benchmark( -# neural_net_states <- fprop(input_batch, weights, repmat), -# neural_net_states2 <- fprop(input_batch, weights, myRepMat), -# neural_net_states3 <- fprop(input_batch, weights, myRepMat2), -# neural_net_states3 <- fprop(input_batch, weights, myRepMat3), -# neural_net_states4 <- fprop(input_batch, weights, myRepMat4), -# replications=10 -# ) - -# % COMPUTE DERIVATIVE. -# %% Expand the target to a sparse 1-of-K vector. + neural_net_states <- fprop(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) + + # benchmark( + # neural_net_states <- fprop(input_batch, weights, repmat), + # neural_net_states2 <- fprop(input_batch, weights, myRepMat), + # neural_net_states3 <- fprop(input_batch, weights, myRepMat2), + # neural_net_states3 <- fprop(input_batch, weights, myRepMat3), + # neural_net_states4 <- fprop(input_batch, weights, myRepMat4), + # replications=10 + # ) + + # % COMPUTE DERIVATIVE. + # %% Expand the target to a sparse 1-of-K vector. expanded_target_batch = expansion_matrix[, as.integer(target_batch)] -# %% Compute derivative of cross-entropy loss function. + # %% Compute derivative of cross-entropy loss function. error_deriv = neural_net_states$output_layer_state - expanded_target_batch -# % MEASURE LOSS FUNCTION. - CE = -sum(colSums(expanded_target_batch * log(neural_net_states$output_layer_state + tiny))) / batchsize + # % MEASURE LOSS FUNCTION. + CE = -matlab::sum(matlab::sum(expanded_target_batch * log(neural_net_states$output_layer_state + tiny))) / batchsize count = count + 1 this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count trainset_CE = trainset_CE + (CE - trainset_CE) / m @@ -122,79 +111,97 @@ train <- function(epochs) { this_chunk_CE = 0 } -# % BACK PROPAGATE. -# %% OUTPUT LAYER. + # % BACK PROPAGATE. + # %% OUTPUT LAYER. # tcrossprod == x %*% t(y) - gradients$hid_to_output = myTCrossProd(neural_net_states$hidden_layer_state, error_deriv) - gradients$output_bias = rowSums(error_deriv) - back_propagated_deriv_1 = myMatMult(weights$hid_to_output,error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) - -# %% HIDDEN LAYER. -# % FILL IN CODE. Replace the line below by one of the options. -# gradients$embed_to_hid = zeros(numhid1 * numwords, numhid2); -# % Options: -# % (a) embed_to_hid_weights_gradient = t(back_propagated_deriv_1) %*% embedding_layer_state; - gradients$embed_to_hid = myTCrossProd(neural_net_states$embedding_layer_state, back_propagated_deriv_1); -# % (c) embed_to_hid_weights_gradient = back_propagated_deriv_1; -# % (d) embed_to_hid_weights_gradient = embedding_layer_state; - -# % FILL IN CODE. Replace the line below by one of the options. -# gradients$hid_bias = zeros(numhid2, 1); -# % Options - gradients$hid_bias = rowSums(back_propagated_deriv_1); -# % (b) hid_bias_gradient = apply(back_propagated_deriv_1, 2, sum); -# % (c) hid_bias_gradient = back_propagated_deriv_1; -# % (d) hid_bias_gradient = back_propagated_deriv_1'; - -# % FILL IN CODE. Replace the line below by one of the options. 
-# back_propagated_deriv_2 = zeros(numhid2, batchsize); -# % Options - back_propagated_deriv_2 = myMatMult(weights$embed_to_hid, back_propagated_deriv_1); -# % (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights; -# % (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights; -# % (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights'; + #gradients$hid_to_output = myTCrossProd(neural_net_states$hidden_layer_state, error_deriv) + #gradients$output_bias = rowSums(error_deriv) + #back_propagated_deriv_1 = myMatMult(weights$hid_to_output,error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) + hid_to_output_weights_gradient <- neural_net_states$hidden_layer_state %*% t(error_deriv) + output_bias_gradient <- rowSums(error_deriv) + back_propagated_deriv_1 <- (hid_to_output_weights %*% error_deriv) * neural_net_states$hidden_layer_state * (1 - neural_net_states$hidden_layer_state) + + + # %% HIDDEN LAYER. + # % FILL IN CODE. Replace the line below by one of the options. + # gradients$embed_to_hid = zeros(numhid1 * numwords, numhid2); + # % Options: + # % (a) embed_to_hid_weights_gradient = t(back_propagated_deriv_1) %*% embedding_layer_state; + #gradients$embed_to_hid = myTCrossProd(neural_net_states$embedding_layer_state, back_propagated_deriv_1); + embed_to_hid_weights_gradient <- neural_net_states$embedding_layer_state %*% t(back_propagated_deriv_1) + # % (c) embed_to_hid_weights_gradient = back_propagated_deriv_1; + # % (d) embed_to_hid_weights_gradient = embedding_layer_state; + + # % FILL IN CODE. Replace the line below by one of the options. + # gradients$hid_bias = zeros(numhid2, 1); + # % Options + #gradients$hid_bias = rowSums(back_propagated_deriv_1); + hid_bias_gradient <- rowSums(back_propagated_deriv_1) + # % (b) hid_bias_gradient = apply(back_propagated_deriv_1, 2, sum); + # % (c) hid_bias_gradient = back_propagated_deriv_1; + # % (d) hid_bias_gradient = back_propagated_deriv_1'; + + # % FILL IN CODE. Replace the line below by one of the options. + # back_propagated_deriv_2 = zeros(numhid2, batchsize); + # % Options + #back_propagated_deriv_2 = myMatMult(weights$embed_to_hid, back_propagated_deriv_1); + back_propagated_deriv_2 <- embed_to_hid_weights %*% back_propagated_deriv_1 + # % (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights; + # % (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights; + # % (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights'; #gradients$word_embedding[,] = 0; -# %% EMBEDDING LAYER. - - gradients$word_embedding[,] <- 0 + # %% EMBEDDING LAYER. 
+ + word_embedding_weights_gradient[,] <- 0 for(w in 1:numwords) { - tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] - gradients$word_embedding= gradients$word_embedding + myTCrossProd(expansion_matrix[, as.integer(input_batch[w, ])], tmp) + #tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] + #gradients$word_embedding= gradients$word_embedding + myTCrossProd(expansion_matrix[, as.integer(input_batch[w, ])], tmp) + word_embedding_weights_gradient <- word_embedding_weights_gradient + + expansion_matrix[,as.integer(input_batch[w,])] %*% + t(back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ]) } -# system.time({ -# embeddingFn <- function(w, back_propogated_deriv_2, numhid1, input_batch, expansion_matrix) { -# tmp <- back_propagated_deriv_2[(1 + (w - 1) * numhid1):(w * numhid1), ] -# myTCrossProd(expansion_matrix[, input_batch[w, ]], tmp) -# } -# w <- mclapply(1:numwords, embeddingFn, back_propogated_deriv_2=back_propogated_deriv_2, numhid1=numhid1, input_batch=input_batch, expansion_matrix=expansion_matrix) -# for(exMat in w) gradients$word_embedding <- gradients$word_embedding + exMat -# }) + + # % UPDATE WEIGHTS AND BIASES. + + word_embedding_weights_delta = momentum * word_embedding_weights_delta + + word_embedding_weights_gradient / batchsize; + word_embedding_weights = word_embedding_weights - (learning_rate * word_embedding_weights_delta); + + embed_to_hid_weights_delta = momentum * embed_to_hid_weights_delta + + embed_to_hid_weights_gradient / batchsize; + embed_to_hid_weights = embed_to_hid_weights - (learning_rate * embed_to_hid_weights_delta); -# % UPDATE WEIGHTS AND BIASES. + hid_to_output_weights_delta = momentum * hid_to_output_weights_delta + + hid_to_output_weights_gradient / batchsize; + hid_to_output_weights = hid_to_output_weights - (learning_rate * hid_to_output_weights_delta); - #deltas <- mapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize)) - deltas <- mcmapply(delta_update_fn, delta=deltas, gradient=gradients, MoreArgs=list(momentum=momentum, batchsize=batchsize), mc.cores=5) + hid_bias_delta = momentum * hid_bias_delta + + hid_bias_gradient / batchsize; + hid_bias = hid_bias - (learning_rate * hid_bias_delta); - #weights <- mapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate)) - weights <- mcmapply(weights_update_fn, delta=deltas, weight=weights, MoreArgs=list(learning_rate=learning_rate), mc.cores=5) + output_bias_delta = momentum * output_bias_delta + + output_bias_gradient / batchsize; + output_bias = output_bias - (learning_rate * output_bias_delta); -# % VALIDATE. + # % VALIDATE. 
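Each of the five parameter updates above applies the same momentum rule that the removed delta_update_fn/weights_update_fn pair expressed generically: the delta accumulates a decaying average of minibatch gradients, and the weights move against it. A minimal sketch on a made-up 2 x 3 parameter matrix (names and sizes are illustrative only):

    momentum_step <- function(weight, delta, gradient, momentum, learning_rate, batchsize) {
      delta  <- momentum * delta + gradient / batchsize  # decaying running average of the gradient
      weight <- weight - learning_rate * delta           # step against the smoothed gradient
      list(weight = weight, delta = delta)
    }

    w <- matrix(0, 2, 3); d <- matrix(0, 2, 3); g <- matrix(1, 2, 3)
    step <- momentum_step(w, d, g, momentum = 0.9, learning_rate = 0.1, batchsize = 100)
    step$weight  # every entry moves by -0.001 on this first step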
if(mod(m, show_validation_CE_after) == 0) { myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, weights, fn=repmat) -# benchmark( -# neural_net_states <- fprop(data$valid_input, weights, fn=repmat), -# neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), -# neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), -# neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), -# neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), -# replications=2 -# ) - + neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + hid_to_output_weights, hid_bias, output_bias) + # benchmark( + # neural_net_states <- fprop(data$valid_input, weights, fn=repmat), + # neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), + # neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), + # neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), + # neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), + # replications=2 + # ) - CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize + datasetsize <- size(data$valid_input, 2) + expanded_valid_target <- expansion_matrix[, as.integer(data$valid_target)] + CE = -matlab::sum(matlab::sum(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize myPrintf(' Validation CE %.3f\n', CE) } } @@ -203,31 +210,38 @@ train <- function(epochs) { myPrintf('Finished Training.\n') myPrintf('Final Training CE %.3f\n', trainset_CE) -# % EVALUATE ON VALIDATION SET. + # % EVALUATE ON VALIDATION SET. myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, weights, fn=repmat) - CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize + neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + hid_to_output_weights, hid_bias, output_bias) + datasetsize <- size(data$valid_input, 2) + expanded_valid_target <- expansion_matrix[, as.integer(data$valid_target)] + CE = -matlab::sum(matlab::sum(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) /datasetsize myPrintf('\rFinal Validation CE %.3f\n', CE) - -# % EVALUATE ON TEST SET. + + # % EVALUATE ON TEST SET. 
myPrintf('\rRunning test ...'); - neural_net_states <- fprop(data$test_input, weights, fn=repmat) - + neural_net_states <- fprop(data$test_input, word_embedding_weights, embed_to_hid_weights, + hid_to_output_weights, hid_bias, output_bias) datasetsize = size(data$valid_input, 2); expanded_valid_target = expansion_matrix[, as.integer(data$test_target)]; - CE = -sum(colSums(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize + CE = -matlab::sum(matlab::sum(expanded_valid_target * log(neural_net_states$output_layer_state + tiny))) / datasetsize myPrintf('\rFinal Test CE %.3f\n', CE) end_time <- proc.time() print(end_time - start_time) - return(list(weights=weights, vocab=data$vocab)) + return(list(word_embedding_weights = word_embedding_weights, + embed_to_hid_weights = embed_to_hid_weights, + hid_to_output_weights = hid_to_output_weights, + hid_bias = hid_bias, + output_bias = output_bias, + vocab=data$vocab)) } -delta_update_fn <- function(delta, gradient, momentum, batchsize) { momentum * delta + gradient / batchsize} -weights_update_fn <- function(delta, weight, learning_rate) {weight - learning_rate * delta} + # Rprof() # model <- train(1) diff --git a/Neural Net Language Model/Train_Original.R b/Neural Net Language Model/Train_Original.R index 74cbe31..4f1ad04 100644 --- a/Neural Net Language Model/Train_Original.R +++ b/Neural Net Language Model/Train_Original.R @@ -14,7 +14,7 @@ source("Neural Net Language Model/FProp_Original.R") # see also http://mathesaurus.sourceforge.net/octave-r.html # % This function trains a neural network language model. -train <- function(epochs) { +train_original <- function(epochs) { # % Inputs: # % epochs: Number of epochs to run. # % Output: @@ -36,7 +36,7 @@ train <- function(epochs) { show_validation_CE_after = 1000 # % LOAD DATA. - data <- load_data(batchsize) + data <- load_data_original(batchsize) tmp <- size(data$train_input) numwords <- tmp[1] batchsize <- tmp[2] @@ -73,14 +73,14 @@ train <- function(epochs) { # % FORWARD PROPAGATE. # % Compute the state of each layer in the network given the input batch and all weights and biases # returns the embedding, hidden and output layer states - neural_net_states <- fprop(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) + neural_net_states <- fprop_original(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) # benchmark( - # neural_net_states <- fprop(input_batch, weights, repmat), - # neural_net_states2 <- fprop(input_batch, weights, myRepMat), - # neural_net_states3 <- fprop(input_batch, weights, myRepMat2), - # neural_net_states3 <- fprop(input_batch, weights, myRepMat3), - # neural_net_states4 <- fprop(input_batch, weights, myRepMat4), + # neural_net_states <- fprop_original(input_batch, weights, repmat), + # neural_net_states2 <- fprop_original(input_batch, weights, myRepMat), + # neural_net_states3 <- fprop_original(input_batch, weights, myRepMat2), + # neural_net_states3 <- fprop_original(input_batch, weights, myRepMat3), + # neural_net_states4 <- fprop_original(input_batch, weights, myRepMat4), # replications=10 # ) @@ -180,14 +180,14 @@ train <- function(epochs) { # % VALIDATE. 
if(mod(m, show_validation_CE_after) == 0) { myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + neural_net_states <- fprop_original(data$valid_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) # benchmark( - # neural_net_states <- fprop(data$valid_input, weights, fn=repmat), - # neural_net_states2 <- fprop(data$valid_input, weights, fn=myRepMat), - # neural_net_states3 <- fprop(data$valid_input, weights, myRepMat2), - # neural_net_states4 <- fprop(data$valid_input, weights, myRepMat3), - # neural_net_states5 <- fprop(data$valid_input, weights, myRepMat4), + # neural_net_states <- fprop_original(data$valid_input, weights, fn=repmat), + # neural_net_states2 <- fprop_original(data$valid_input, weights, fn=myRepMat), + # neural_net_states3 <- fprop_original(data$valid_input, weights, myRepMat2), + # neural_net_states4 <- fprop_original(data$valid_input, weights, myRepMat3), + # neural_net_states5 <- fprop_original(data$valid_input, weights, myRepMat4), # replications=2 # ) @@ -205,7 +205,7 @@ train <- function(epochs) { # % EVALUATE ON VALIDATION SET. myPrintf('\rRunning validation ...') - neural_net_states <- fprop(data$valid_input, word_embedding_weights, embed_to_hid_weights, + neural_net_states <- fprop_original(data$valid_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) datasetsize <- size(data$valid_input, 2) expanded_valid_target <- expansion_matrix[, as.integer(data$valid_target)] @@ -215,7 +215,7 @@ train <- function(epochs) { # % EVALUATE ON TEST SET. myPrintf('\rRunning test ...'); - neural_net_states <- fprop(data$test_input, word_embedding_weights, embed_to_hid_weights, + neural_net_states <- fprop_original(data$test_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) datasetsize = size(data$valid_input, 2); expanded_valid_target = expansion_matrix[, as.integer(data$test_target)];
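Both evaluation blocks above rely on the same two steps: expand the integer word targets into 1-of-K columns by indexing an identity matrix, then average the cross-entropy of the predicted distributions over the cases. A small self-contained sketch with a made-up 5-word vocabulary and 3 cases (all values are illustrative only):

    vocab_size <- 5
    expansion_matrix <- diag(vocab_size)             # eye(vocab_size): column k is the 1-of-K code for word k
    targets <- c(2, 5, 3)                            # one target word index per case
    expanded_targets <- expansion_matrix[, targets]  # vocab_size x 3 matrix of one-hot columns

    probs <- matrix(1 / vocab_size, vocab_size, 3)   # dummy uniform predictions standing in for fprop output
    tiny  <- exp(-30)                                # keeps log() away from exactly-zero probabilities
    CE <- -sum(expanded_targets * log(probs + tiny)) / length(targets)
    CE                                               # log(5), about 1.609, for the uniform prediction

The division is by the number of cases (batchsize during training, datasetsize during validation and testing), so the reported CE is the average negative log-probability of the correct next word.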