diff --git a/NN/nnapplygrads.m b/NN/nnapplygrads.m
index 781163b..649258d 100644
--- a/NN/nnapplygrads.m
+++ b/NN/nnapplygrads.m
@@ -10,7 +10,12 @@
             dW = nn.dW{i};
         end
 
-        dW = nn.learningRate * dW;
+        % to apply different learning rates to each layer
+        if isempty(nn.learningRatePerLayer)
+            dW = nn.learningRate * dW;
+        else
+            dW = nn.learningRatePerLayer(i) * dW;
+        end
 
         if(nn.momentum>0)
             nn.vW{i} = nn.momentum*nn.vW{i} + dW;
diff --git a/NN/nnsetup.m b/NN/nnsetup.m
index b8ec742..7a1162d 100644
--- a/NN/nnsetup.m
+++ b/NN/nnsetup.m
@@ -8,6 +8,7 @@
     nn.activation_function              = 'tanh_opt';   %  Activation functions of hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh).
     nn.learningRate                     = 2;            %  learning rate Note: typically needs to be lower when using 'sigm' activation function and non-normalized inputs.
+    nn.learningRatePerLayer             = [];           %  learning rate per layer - for transfer learning pre-training and fine-tuning different parts of the network (should be of length nn.n - 1)
     nn.momentum                         = 0.5;          %  Momentum
     nn.scaling_learningRate             = 1;            %  Scaling factor for the learning rate (each epoch)
     nn.weightPenaltyL2                  = 0;            %  L2 regularization
diff --git a/NN/nntrain.m b/NN/nntrain.m
index af844a6..1e2d1e8 100644
--- a/NN/nntrain.m
+++ b/NN/nntrain.m
@@ -72,6 +72,17 @@
         disp(['epoch ' num2str(i) '/' num2str(opts.numepochs) '. Took ' num2str(t) ' seconds' '. Mini-batch mean squared error on training set is ' num2str(mean(L((n-numbatches):(n-1)))) str_perf]);
 
         nn.learningRate = nn.learningRate * nn.scaling_learningRate;
+        if ~isempty(nn.learningRatePerLayer)
+            nn.learningRatePerLayer = nn.learningRatePerLayer * nn.scaling_learningRate;
+        end
+
+        if isfield(opts,'tol')
+            if opts.validation == 1 && loss.val.e(end)
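
Usage note (illustrative, not part of the patch): a minimal sketch of how the new nn.learningRatePerLayer option could be used to freeze a pre-trained first weight layer while fine-tuning the rest, assuming the standard DeepLearnToolbox nnsetup/nntrain interface. The network shape, the [0 1] rate vector, and the train_x/train_y matrices are hypothetical placeholders; the vector must have length nn.n - 1, one entry per weight layer.

    % Minimal sketch, assuming DeepLearnToolbox's nnsetup/nntrain API and
    % pre-loaded train_x (N x 784) / train_y (N x 10) matrices.
    nn = nnsetup([784 100 10]);        % 3-layer net => nn.n - 1 = 2 weight layers
    nn.learningRatePerLayer = [0 1];   % 0 freezes the input->hidden weights,
                                       % 1 fine-tunes the hidden->output weights
    opts.numepochs = 10;
    opts.batchsize = 100;
    nn = nntrain(nn, train_x, train_y, opts);

With a per-layer rate of 0 the corresponding dW in nnapplygrads is zeroed every update (and the momentum buffer vW starts at zero), so that layer's weights stay fixed; per-layer rates are also rescaled by nn.scaling_learningRate each epoch, mirroring the global learning rate.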