Implement LayerNormalization
Dobiasd committed Nov 28, 2023
1 parent 352aaae commit f7630b1
Showing 7 changed files with 280 additions and 202 deletions.
README.md (4 changes: 2 additions & 2 deletions)
@@ -47,7 +47,7 @@ Would you like to build/train a model using Keras/Python? And would you like to
* `BatchNormalization`, `Dense`, `Flatten`, `Normalization`
* `Dropout`, `AlphaDropout`, `GaussianDropout`, `GaussianNoise`
* `SpatialDropout1D`, `SpatialDropout2D`, `SpatialDropout3D`
* `ActivityRegularization`
* `ActivityRegularization`, `LayerNormalization`
* `RandomContrast`, `RandomFlip`, `RandomHeight`
* `RandomRotation`, `RandomTranslation`, `RandomWidth`, `RandomZoom`
* `MaxPooling1D/2D/3D`, `GlobalMaxPooling1D/2D/3D`
@@ -77,7 +77,7 @@ Would you like to build/train a model using Keras/Python? And would you like to
`Conv3D`, `ConvLSTM1D`, `ConvLSTM2D`, `Discretization`,
`GRUCell`, `Hashing`,
`IntegerLookup`,
`LayerNormalization`, `LocallyConnected1D`, `LocallyConnected2D`,
`LocallyConnected1D`, `LocallyConnected2D`,
`LSTMCell`, `Masking`, `MultiHeadAttention`,
`RepeatVector`, `RNN`, `SimpleRNN`,
`SimpleRNNCell`, `StackedRNNCells`, `StringLookup`, `TextVectorization`,
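For reference, a model containing a `LayerNormalization` layer is loaded and run like any other converted model. The snippet below is a hypothetical usage sketch modeled on the project's README example; the file name, input shape, and input values are placeholders.

#include <fdeep/fdeep.hpp>
#include <iostream>
#include <vector>

int main()
{
    // "fdeep_model.json" is a placeholder for a model exported with the
    // project's Python conversion script and containing a LayerNormalization layer.
    const auto model = fdeep::load_model("fdeep_model.json");
    const auto result = model.predict(
        {fdeep::tensor(fdeep::tensor_shape(static_cast<std::size_t>(4)),
            std::vector<float>{1, 2, 3, 4})});
    std::cout << fdeep::show_tensors(result) << std::endl;
}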
include/fdeep/import_model.hpp (18 changes: 18 additions & 0 deletions)
@@ -51,6 +51,7 @@
#include "fdeep/layers/hard_sigmoid_layer.hpp"
#include "fdeep/layers/input_layer.hpp"
#include "fdeep/layers/layer.hpp"
#include "fdeep/layers/layer_normalization_layer.hpp"
#include "fdeep/layers/leaky_relu_layer.hpp"
#include "fdeep/layers/embedding_layer.hpp"
#include "fdeep/layers/lstm_layer.hpp"
@@ -536,6 +537,22 @@ inline layer_ptr create_batch_normalization_layer(const get_param_f& get_param,
name, axis, moving_mean, moving_variance, beta, gamma, epsilon);
}


inline layer_ptr create_layer_normalization_layer(const get_param_f& get_param,
const nlohmann::json& data, const std::string& name)
{
const bool center = data["config"]["center"];
const bool scale = data["config"]["scale"];
const auto axes = create_vector<int>(create_int, data["config"]["axis"]);
const float_type epsilon = data["config"]["epsilon"];
float_vec gamma;
float_vec beta;
if (scale) gamma = decode_floats(get_param(name, "gamma"));
if (center) beta = decode_floats(get_param(name, "beta"));
return std::make_shared<layer_normalization_layer>(
name, axes, beta, gamma, epsilon);
}

inline layer_ptr create_identity_layer(
const get_param_f&, const nlohmann::json&, const std::string& name)
{
@@ -1283,6 +1300,7 @@ inline layer_ptr create_layer(const get_param_f& get_param,
{"DepthwiseConv2D", create_depthwise_conv_2D_layer},
{"InputLayer", create_input_layer},
{"BatchNormalization", create_batch_normalization_layer},
{"LayerNormalization", create_layer_normalization_layer},
{"Dropout", create_identity_layer},
{"ActivityRegularization", create_identity_layer},
{"AlphaDropout", create_identity_layer},
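The new `create_layer_normalization_layer` factory reads `center`, `scale`, `axis`, and `epsilon` from the layer's JSON config and fetches the `gamma` and `beta` weights only when the corresponding flag is set. A rough, hand-written sketch of the config fragment it consumes (illustrative only; a real exported model contains additional fields):

#include <nlohmann/json.hpp>
#include <iostream>

int main()
{
    // Hypothetical config fragment, not taken from an actual exported model.
    const auto data = nlohmann::json::parse(R"({
        "config": {
            "center": true,
            "scale": true,
            "axis": [-1],
            "epsilon": 0.001
        }
    })");
    const bool center = data["config"]["center"];   // same accesses as the importer
    const bool scale = data["config"]["scale"];
    const double epsilon = data["config"]["epsilon"];
    std::cout << center << " " << scale << " " << epsilon << "\n"; // prints: 1 1 0.001
}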
include/fdeep/layers/batch_normalization_layer.hpp (212 changes: 23 additions & 189 deletions)
@@ -27,204 +27,38 @@ class batch_normalization_layer : public layer
float_type epsilon)
: layer(name),
axis_(axis),
moving_mean_(moving_mean),
moving_variance_(moving_variance),
beta_(beta),
gamma_(gamma),
moving_mean_(fplus::make_shared_ref<float_vec>(moving_mean)),
moving_variance_(fplus::make_shared_ref<float_vec>(moving_variance)),
beta_(fplus::make_shared_ref<float_vec>(beta)),
gamma_(fplus::make_shared_ref<float_vec>(gamma)),
epsilon_(epsilon)
{
assertion(moving_variance.size() == moving_mean.size(), "invalid sizes");
assertion(beta.empty() || beta.size() == moving_mean.size(), "invalid sizes");
assertion(gamma.empty() || gamma.size() == moving_mean.size(), "invalid sizes");
}
protected:
int axis_;
float_vec moving_mean_;
float_vec moving_variance_;
float_vec beta_;
float_vec gamma_;
shared_float_vec moving_mean_;
shared_float_vec moving_variance_;
shared_float_vec beta_;
shared_float_vec gamma_;
float_type epsilon_;

static float_type apply_to_value_gamma_beta(
const float_vec& moving_mean, const float_vec& beta, const float_vec& gamma,
float_type val, float_type denom, std::size_t z)
{
val -= moving_mean[z];
val *= gamma[z];
val /= denom;
val += beta[z];
return val;
}

static float_type apply_to_value_gamma(
const float_vec& moving_mean, const float_vec&, const float_vec& gamma,
float_type val, float_type denom, std::size_t z)
{
val -= moving_mean[z];
val *= gamma[z];
val /= denom;
return val;
}

static float_type apply_to_value_beta(
const float_vec& moving_mean, const float_vec& beta, const float_vec&,
float_type val, float_type denom, std::size_t z)
{
val -= moving_mean[z];
val /= denom;
val += beta[z];
return val;
}

static float_type apply_to_value(
const float_vec& moving_mean, const float_vec&, const float_vec&,
float_type val, float_type denom, std::size_t z)
{
val -= moving_mean[z];
val /= denom;
return val;
}

template <typename F>
static void apply_to_channel(const F f,
const float_vec& moving_mean, const float_vec& beta, const float_vec& gamma,
const tensor& input, tensor& output, float_type denom, std::size_t z, std::size_t dim5, std::size_t dim4)
{
for (std::size_t y = 0; y < output.shape().height_; ++y)
{
for (std::size_t x = 0; x < output.shape().width_; ++x)
{
output.set_ignore_rank(tensor_pos(dim5, dim4, y, x, z),
f(moving_mean, beta, gamma,
input.get_ignore_rank(tensor_pos(dim5, dim4, y, x, z)), denom, z));
}
}
}

tensor apply_to_slices(const tensor& input) const
{
assertion(moving_mean_.size() == input.shape().depth_,
"invalid beta");
assertion(moving_variance_.size() == input.shape().depth_,
"invalid beta");

const bool use_gamma = !gamma_.empty();
if (use_gamma)
{
assertion(gamma_.size() == input.shape().depth_, "invalid gamma");
}

const bool use_beta = !beta_.empty();
if (use_beta)
{
assertion(beta_.size() == input.shape().depth_, "invalid beta");
}

tensor output(input.shape(), 0);

const auto denoms = fplus::transform([this](const auto& mv)
{ return std::sqrt(mv + this->epsilon_); },
moving_variance_);

if (use_gamma && use_beta) {
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
{
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
{
for (std::size_t z = 0; z < output.shape().depth_; ++z)
{
apply_to_channel(apply_to_value_gamma_beta, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
}
}
}
}
else if (use_gamma) {
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
{
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
{
for (std::size_t z = 0; z < output.shape().depth_; ++z)
{
apply_to_channel(apply_to_value_gamma, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
}
}
}
}
else if (use_beta) {
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
{
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
{
for (std::size_t z = 0; z < output.shape().depth_; ++z)
{
apply_to_channel(apply_to_value_beta, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
}
}
}
}
else {
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
{
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
{
for (std::size_t z = 0; z < output.shape().depth_; ++z)
{
apply_to_channel(apply_to_value, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
}
}
}
}
return output;
}

tensors apply_impl(const tensors& inputs) const override
{
const auto& input = single_tensor_from_tensors(inputs);
const int adjusted_axis =
axis_ == -1
? 5
: 5 + axis_ - static_cast<int>(input.shape().rank());

if (adjusted_axis == 5)
{
return {apply_to_slices(input)};
}
else if (adjusted_axis == 4)
{
return {tensor_with_changed_rank(
permute_tensor(apply_to_slices(permute_tensor(
tensor_with_changed_rank(input, 5),
{1, 2, 3, 5, 4})),
{1, 2, 3, 5, 4}), input.shape().rank())};
}
else if (adjusted_axis == 3)
{
return {tensor_with_changed_rank(
permute_tensor(apply_to_slices(permute_tensor(
tensor_with_changed_rank(input, 5),
{1, 2, 5, 4, 3})),
{1, 2, 5, 4, 3}), input.shape().rank())};
}
else if (adjusted_axis == 2)
{
return {tensor_with_changed_rank(
permute_tensor(apply_to_slices(permute_tensor(
tensor_with_changed_rank(input, 5),
{1, 5, 3, 4, 2})),
{1, 5, 3, 4, 2}), input.shape().rank())};
}
else if (adjusted_axis == 1)
{
return {tensor_with_changed_rank(
permute_tensor(apply_to_slices(permute_tensor(
tensor_with_changed_rank(input, 5),
{5, 2, 3, 4, 1})),
{5, 2, 3, 4, 1}), input.shape().rank())};
}
else {
raise_error("Invalid axis for batch normalization.");
// Just to make the compiler happy.
// In reality, this is never called.
return inputs;
}

const auto input = single_tensor_from_tensors(inputs);
std::vector<std::size_t> dims(5, 1);
dims[rank_aligned_axis_to_absolute_axis(input.shape().rank(), axis_) - 1] = moving_mean_->size();
const tensor_shape params_shape = create_tensor_shape_from_dims(dims);
return {batch_normalization(
input,
broadcast(tensor(params_shape, moving_mean_), input.shape()),
broadcast(tensor(params_shape, moving_variance_), input.shape()),
beta_->empty() ? tensor(input.shape(), 0) : broadcast(tensor(params_shape, beta_), input.shape()),
gamma_->empty() ? tensor(input.shape(), 1) : broadcast(tensor(params_shape, gamma_), input.shape()),
epsilon_
)};
}
};

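The rewritten `apply_impl` no longer loops over slices by hand; it broadcasts the parameter vectors to the input shape and hands the element-wise work to a `batch_normalization` helper. Assuming that helper implements standard inference-time batch normalization (which is what the removed per-channel loops computed), the per-element formula amounts to this scalar sketch:

#include <cmath>

// Scalar sketch only; the real code operates on whole tensors after broadcasting.
// y = gamma * (x - mean) / sqrt(variance + epsilon) + beta
float batch_norm_value(float x, float mean, float variance,
                       float beta, float gamma, float epsilon)
{
    const float denom = std::sqrt(variance + epsilon);
    return gamma * (x - mean) / denom + beta;
}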
include/fdeep/layers/layer_normalization_layer.hpp (66 changes: 66 additions & 0 deletions)
@@ -0,0 +1,66 @@
// Copyright 2016, Tobias Hermann.
// https://github.com/Dobiasd/frugally-deep
// Distributed under the MIT License.
// (See accompanying LICENSE file or at
// https://opensource.org/licenses/MIT)

#pragma once

#include "fdeep/layers/layer.hpp"
#include "fdeep/layers/batch_normalization_layer.hpp"

#include <string>

namespace fdeep { namespace internal
{

class layer_normalization_layer : public layer
{
public:
explicit layer_normalization_layer(const std::string& name,
std::vector<int> axes,
const float_vec& beta,
const float_vec& gamma,
float_type epsilon)
: layer(name),
axes_(axes),
beta_(fplus::make_shared_ref<float_vec>(beta)),
gamma_(fplus::make_shared_ref<float_vec>(gamma)),
epsilon_(epsilon)
{
}
protected:
std::vector<int> axes_;
shared_float_vec beta_;
shared_float_vec gamma_;
float_type epsilon_;

tensors apply_impl(const tensors& inputs) const override
{
// https://github.com/keras-team/keras/blob/v2.14.0/keras/layers/normalization/layer_normalization.py#L291-L304
const auto& input = single_tensor_from_tensors(inputs);
const auto& input_moments = moments(input, axes_);
const auto& mean = input_moments.first;
const auto& variance = input_moments.second;

std::vector<std::size_t> dims(5, 1);
tensor_shape input_shape = input.shape();
input_shape.maximize_rank();
const auto input_shape_dimensions = input_shape.dimensions();
for (const auto axis : axes_) {
const std::size_t pos = rank_aligned_axis_to_absolute_axis(input.shape().rank(), axis) - 1;
dims[pos] = input_shape_dimensions[pos];
}
const tensor_shape params_shape = create_tensor_shape_from_dims(dims);
return {batch_normalization(
input,
mean,
variance,
beta_->empty() ? tensor(input.shape(), 0) : broadcast(tensor(params_shape, beta_), input.shape()),
gamma_->empty() ? tensor(input.shape(), 1) : broadcast(tensor(params_shape, gamma_), input.shape()),
epsilon_
)};
}
};

} } // namespace fdeep, namespace internal
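Unlike batch normalization, the statistics here are not stored weights: `moments` computes mean and variance from the input itself over the configured axes, and `gamma`/`beta` are broadcast along those axes before the shared `batch_normalization` routine is applied. Below is a self-contained sketch of the underlying math for the common case of normalizing over the last axis only (an assumed simplification; the layer itself handles arbitrary axis combinations on tensors).

#include <cmath>
#include <cstddef>
#include <vector>

// Returns gamma[i] * (x[i] - mean) / sqrt(variance + epsilon) + beta[i] for each i,
// with mean and variance computed from x itself (biased variance, as in Keras).
std::vector<float> layer_norm_last_axis(const std::vector<float>& x,
    const std::vector<float>& gamma, const std::vector<float>& beta,
    float epsilon)
{
    float mean = 0.0f;
    for (const float v : x) mean += v;
    mean /= static_cast<float>(x.size());

    float variance = 0.0f;
    for (const float v : x) variance += (v - mean) * (v - mean);
    variance /= static_cast<float>(x.size());

    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
        y[i] = gamma[i] * (x[i] - mean) / std::sqrt(variance + epsilon) + beta[i];
    return y;
}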