From a2992f940e0932bd65cc405b4b9435ed88b3ea68 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Mon, 14 Aug 2023 09:44:33 +0200 Subject: [PATCH 01/31] Add MultiHeadAttention layer --- include/fdeep/import_model.hpp | 18 ++++++++++ .../fdeep/layers/additive_attention_layer.hpp | 2 +- .../layers/multi_head_attention_layer.hpp | 33 +++++++++++++++++++ keras_export/convert_model.py | 6 ++++ keras_export/generate_test_models.py | 11 +++++++ 5 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 include/fdeep/layers/multi_head_attention_layer.hpp diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index 754da32a..e3f916bc 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -62,6 +62,7 @@ #include "fdeep/layers/maximum_layer.hpp" #include "fdeep/layers/minimum_layer.hpp" #include "fdeep/layers/model_layer.hpp" +#include "fdeep/layers/multi_head_attention_layer.hpp" #include "fdeep/layers/multiply_layer.hpp" #include "fdeep/layers/normalization_layer.hpp" #include "fdeep/layers/pooling_3d_layer.hpp" @@ -1043,6 +1044,22 @@ inline layer_ptr create_additive_attention_layer( return std::make_shared(name, scale); } +inline layer_ptr create_multi_head_attention_layer( + const get_param_f&, + const nlohmann::json&, const std::string& name) +{ + /* + const std::size_t num_heads = data["config"]["num_heads"]; + const std::size_t key_dim = data["config"]["key_dim"]; + const std::size_t value_dim = data["config"]["value_dim"]; + const auto attention_axes = + create_vector>(fplus::bind_1st_of_2( + create_vector, create_size_t), + data["config"]["padding"]); + */ + return std::make_shared(name); +} + inline std::string get_activation_type(const nlohmann::json& data) { assertion(data.is_string(), "Layer activation must be a string."); @@ -1351,6 +1368,7 @@ inline layer_ptr create_layer(const get_param_f& get_param, {"CategoryEncoding", create_category_encoding_layer}, {"Attention", create_attention_layer}, {"AdditiveAttention", create_additive_attention_layer}, + {"MultiHeadAttention", create_multi_head_attention_layer}, }; const wrapper_layer_creators wrapper_creators = { diff --git a/include/fdeep/layers/additive_attention_layer.hpp b/include/fdeep/layers/additive_attention_layer.hpp index a7f02688..6fb1d4f7 100644 --- a/include/fdeep/layers/additive_attention_layer.hpp +++ b/include/fdeep/layers/additive_attention_layer.hpp @@ -24,7 +24,7 @@ class additive_attention_layer : public layer protected: tensors apply_impl(const tensors& input) const override { - assertion(input.size() == 2 or input.size() == 3, "Invalid number of inputs for Attention layer."); + assertion(input.size() == 2 or input.size() == 3, "Invalid number of inputs for AdditiveAttention layer."); const tensor& query = input[0]; const tensor& value = input[1]; const tensor& key = input.size() > 2 ? input[2] : value; diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp new file mode 100644 index 00000000..4d72b840 --- /dev/null +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -0,0 +1,33 @@ +// Copyright 2016, Tobias Hermann. +// https://github.com/Dobiasd/frugally-deep +// Distributed under the MIT License. 
+// (See accompanying LICENSE file or at +// https://opensource.org/licenses/MIT) + +#pragma once + +#include "fdeep/layers/layer.hpp" +#include "fdeep/layers/softmax_layer.hpp" + +#include + +namespace fdeep { namespace internal +{ + +class multi_head_attention_layer : public layer +{ +public: + explicit multi_head_attention_layer(const std::string& name) + : layer(name) + { + } +protected: + tensors apply_impl(const tensors& input) const override + { + assertion(input.size() == 2 or input.size() == 3, "Invalid number of inputs for MultiHeadAttention layer."); + // todo: implement + return input; + } +}; + +} } // namespace fdeep, namespace internal diff --git a/keras_export/convert_model.py b/keras_export/convert_model.py index 1287ea17..9b13833e 100755 --- a/keras_export/convert_model.py +++ b/keras_export/convert_model.py @@ -549,6 +549,11 @@ def show_additive_attention_layer(layer): return data +def show_multi_head_attention_layer(layer): + """Serialize MultiHeadAttention layer to dict""" + assert layer._output_shape is None + + def get_layer_functions_dict(): return { 'Conv1D': show_conv_1d_layer, @@ -575,6 +580,7 @@ def get_layer_functions_dict(): 'CategoryEncoding': show_category_encoding_layer, 'Attention': show_attention_layer, 'AdditiveAttention': show_additive_attention_layer, + 'MultiHeadAttention': show_multi_head_attention_layer, } diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index 00983bf5..809abc9f 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -23,6 +23,7 @@ from tensorflow.keras.layers import MaxPooling1D, AveragePooling1D, UpSampling1D from tensorflow.keras.layers import MaxPooling2D, AveragePooling2D, UpSampling2D from tensorflow.keras.layers import MaxPooling3D, AveragePooling3D +from tensorflow.keras.layers import MultiHeadAttention from tensorflow.keras.layers import Multiply, Add, Subtract, Average, Maximum, Minimum, Dot from tensorflow.keras.layers import Permute, Reshape, RepeatVector from tensorflow.keras.layers import SeparableConv2D, DepthwiseConv2D @@ -402,6 +403,16 @@ def get_test_model_exhaustive(): outputs.append(AdditiveAttention(use_scale=True)([inputs[49], inputs[50]])) outputs.append(AdditiveAttention(use_scale=True)([inputs[49], inputs[50], inputs[51]])) + outputs.append(MultiHeadAttention( + num_heads=1, key_dim=1, value_dim=None, + use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + outputs.append(MultiHeadAttention( + num_heads=3, key_dim=2, value_dim=None, + use_bias=False, output_shape=None, attention_axes=(2, 3))(inputs[2], inputs[3])) + outputs.append(MultiHeadAttention( + num_heads=3, key_dim=2, value_dim=1, + use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) + shared_conv = Conv2D(1, (1, 1), padding='valid', name='shared_conv', activation='relu') From e0a886dcfce0851b0cdb635b0b347496c6550e7d Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Mon, 14 Aug 2023 10:21:22 +0200 Subject: [PATCH 02/31] Pass parameters to multi_head_attention_layer --- include/fdeep/import_model.hpp | 17 ++++++++--------- .../fdeep/layers/multi_head_attention_layer.hpp | 14 +++++++++++--- keras_export/convert_model.py | 2 ++ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index e3f916bc..8e5a2d7a 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -1045,19 +1045,18 @@ inline layer_ptr 
create_additive_attention_layer( } inline layer_ptr create_multi_head_attention_layer( - const get_param_f&, - const nlohmann::json&, const std::string& name) + const get_param_f& get_param, + const nlohmann::json& data, const std::string& name) { - /* const std::size_t num_heads = data["config"]["num_heads"]; const std::size_t key_dim = data["config"]["key_dim"]; const std::size_t value_dim = data["config"]["value_dim"]; - const auto attention_axes = - create_vector>(fplus::bind_1st_of_2( - create_vector, create_size_t), - data["config"]["padding"]); - */ - return std::make_shared(name); + const bool use_bias = data["config"]["use_bias"]; + const auto attention_axes = create_vector(create_size_t, + data["config"]["attention_axes"]); + const auto weights = create_vector(decode_floats, get_param(name, "weights")); + return std::make_shared(name, + num_heads, key_dim, value_dim, use_bias, attention_axes); } inline std::string get_activation_type(const nlohmann::json& data) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 4d72b840..0d6c5332 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -17,17 +17,25 @@ namespace fdeep { namespace internal class multi_head_attention_layer : public layer { public: - explicit multi_head_attention_layer(const std::string& name) - : layer(name) + explicit multi_head_attention_layer(const std::string& name, + std::size_t num_heads, std::size_t key_dim, std::size_t value_dim, + bool use_bias, const std::vector& attention_axes) + : layer(name), num_heads_(num_heads), key_dim_(key_dim), + value_dim_(value_dim), use_bias_(use_bias), attention_axes_(attention_axes) { } protected: tensors apply_impl(const tensors& input) const override { - assertion(input.size() == 2 or input.size() == 3, "Invalid number of inputs for MultiHeadAttention layer."); + // input.size() is 1. How shall the other tensors passed here? How is it in TF? 
// todo: implement return input; } + std::size_t num_heads_; + std::size_t key_dim_; + std::size_t value_dim_; + bool use_bias_; + std::vector attention_axes_; }; } } // namespace fdeep, namespace internal diff --git a/keras_export/convert_model.py b/keras_export/convert_model.py index 9b13833e..397f9000 100755 --- a/keras_export/convert_model.py +++ b/keras_export/convert_model.py @@ -552,6 +552,8 @@ def show_additive_attention_layer(layer): def show_multi_head_attention_layer(layer): """Serialize MultiHeadAttention layer to dict""" assert layer._output_shape is None + return {'weights': list(map(lambda w: encode_floats(w.numpy()), layer.weights))} + # todo: shapes too def get_layer_functions_dict(): From 931bc277dcf0669f56e704508b88db013b9822bd Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Mon, 14 Aug 2023 10:41:37 +0200 Subject: [PATCH 03/31] Persist weight shapes too --- include/fdeep/import_model.hpp | 7 ++++++- include/fdeep/layers/multi_head_attention_layer.hpp | 7 +++++-- keras_export/convert_model.py | 6 ++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index 8e5a2d7a..5a01da47 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -1054,9 +1054,14 @@ inline layer_ptr create_multi_head_attention_layer( const bool use_bias = data["config"]["use_bias"]; const auto attention_axes = create_vector(create_size_t, data["config"]["attention_axes"]); + const auto weight_shapes = + create_vector>(fplus::bind_1st_of_2( + create_vector, create_size_t), + get_param(name, "weight_shapes")); const auto weights = create_vector(decode_floats, get_param(name, "weights")); + // todo: Convert weight_shapes and weights to Tensors before passing to ctor? 
return std::make_shared(name, - num_heads, key_dim, value_dim, use_bias, attention_axes); + num_heads, key_dim, value_dim, use_bias, attention_axes, weights); } inline std::string get_activation_type(const nlohmann::json& data) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 0d6c5332..c14439b7 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -19,9 +19,11 @@ class multi_head_attention_layer : public layer public: explicit multi_head_attention_layer(const std::string& name, std::size_t num_heads, std::size_t key_dim, std::size_t value_dim, - bool use_bias, const std::vector& attention_axes) + bool use_bias, const std::vector& attention_axes, + const std::vector& weights) : layer(name), num_heads_(num_heads), key_dim_(key_dim), - value_dim_(value_dim), use_bias_(use_bias), attention_axes_(attention_axes) + value_dim_(value_dim), use_bias_(use_bias), attention_axes_(attention_axes), + weights_(weights) { } protected: @@ -36,6 +38,7 @@ class multi_head_attention_layer : public layer std::size_t value_dim_; bool use_bias_; std::vector attention_axes_; + std::vector weights_; }; } } // namespace fdeep, namespace internal diff --git a/keras_export/convert_model.py b/keras_export/convert_model.py index 397f9000..bbb44b2e 100755 --- a/keras_export/convert_model.py +++ b/keras_export/convert_model.py @@ -552,8 +552,10 @@ def show_additive_attention_layer(layer): def show_multi_head_attention_layer(layer): """Serialize MultiHeadAttention layer to dict""" assert layer._output_shape is None - return {'weights': list(map(lambda w: encode_floats(w.numpy()), layer.weights))} - # todo: shapes too + return { + 'weight_shapes': list(map(lambda w: list(w.shape), layer.weights)), + 'weights': list(map(lambda w: encode_floats(w.numpy()), layer.weights)), + } def get_layer_functions_dict(): From 14aa03a54c65118bd416cf99a87f034bcf547d14 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Wed, 1 Nov 2023 06:49:47 +0100 Subject: [PATCH 04/31] Add comment with link to stackoverflow question --- include/fdeep/layers/multi_head_attention_layer.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index c14439b7..664401bf 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -30,6 +30,7 @@ class multi_head_attention_layer : public layer tensors apply_impl(const tensors& input) const override { // input.size() is 1. How shall the other tensors passed here? How is it in TF? 
+ // https://stackoverflow.com/questions/77400589/what-is-the-reason-for-multiheadattention-having-a-different-call-convention-tha // todo: implement return input; } From e8efb244a42da3d7d2e22432dea25aa316a16e09 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Tue, 7 Nov 2023 08:24:30 +0100 Subject: [PATCH 05/31] Implement special treatment for the inbound_nodes format of MultiHeadAttention --- include/fdeep/import_model.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index 5a01da47..9db2c3bf 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -1137,11 +1137,35 @@ inline node create_node(const nlohmann::json& inbound_nodes_data) inbound_nodes_data)); } +inline nodes create_multi_head_attention_nodes(const std::vector inbound_nodes_data) +{ + assertion(inbound_nodes_data.size() == 1 && inbound_nodes_data.front().size() == 1, + "multi_head_attention needs to have exactly one primary inbound node; see https://stackoverflow.com/q/77400589/1866775"); + const auto inbound_node_data = inbound_nodes_data.front().front(); + const auto value = inbound_node_data[3]["value"]; + if (json_obj_has_member(inbound_node_data[3], "key")) { + return { + node({ + create_node_connection(inbound_node_data), + create_node_connection(value), + create_node_connection(inbound_node_data[3]["key"]) + })}; + } + return { + node({ + create_node_connection(inbound_node_data), + create_node_connection(value) + })}; +} + inline nodes create_nodes(const nlohmann::json& data) { assertion(data["inbound_nodes"].is_array(), "no inbound nodes"); const std::vector inbound_nodes_data = data["inbound_nodes"]; + if (data["class_name"] == "MultiHeadAttention") { + return create_multi_head_attention_nodes(inbound_nodes_data); + } return fplus::transform(create_node, inbound_nodes_data); } From 7a7928dc1792b05fac42eab7246c9bc7a9d09ad4 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Tue, 7 Nov 2023 09:13:00 +0100 Subject: [PATCH 06/31] Adjust comment in multi_head_attention_layer --- include/fdeep/layers/multi_head_attention_layer.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 664401bf..f7c9dd46 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -29,9 +29,11 @@ class multi_head_attention_layer : public layer protected: tensors apply_impl(const tensors& input) const override { - // input.size() is 1. How shall the other tensors passed here? How is it in TF? - // https://stackoverflow.com/questions/77400589/what-is-the-reason-for-multiheadattention-having-a-different-call-convention-tha - // todo: implement + assertion(input.size() == 2 || input.size() == 3, "Invalid number of inputs for MultiHeadAttention layer."); + //const tensor& query = input[0]; + //const tensor& value = input[1]; + //const tensor& key = input.size() > 2 ? 
input[2] : value; + // https://github.com/keras-team/keras/blob/v2.14.0/keras/layers/attention/multi_head_attention.py return input; } std::size_t num_heads_; From 3bfbf0797edaeb395e59059adf336451c65b37f2 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sat, 16 Dec 2023 07:32:46 +0100 Subject: [PATCH 07/31] comment --- include/fdeep/layers/multi_head_attention_layer.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index f7c9dd46..b727f900 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -34,6 +34,7 @@ class multi_head_attention_layer : public layer //const tensor& value = input[1]; //const tensor& key = input.size() > 2 ? input[2] : value; // https://github.com/keras-team/keras/blob/v2.14.0/keras/layers/attention/multi_head_attention.py + // https://gist.github.com/sevagh/b71d253a347a9b59c026580625452fc5 return input; } std::size_t num_heads_; From 9743a49aafebcda5d7cbdaba0d9fa8119da491bc Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Tue, 26 Dec 2023 16:31:50 +0100 Subject: [PATCH 08/31] add comment --- include/fdeep/layers/multi_head_attention_layer.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index b727f900..63ea5333 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -33,6 +33,8 @@ class multi_head_attention_layer : public layer //const tensor& query = input[0]; //const tensor& value = input[1]; //const tensor& key = input.size() > 2 ? input[2] : value; + // https://towardsdatascience.com/transformers-explained-visually-part-3-multi-head-attention-deep-dive-1c1ff1024853 + // https://dmol.pub/dl/attention.html#multi-head-attention-block // https://github.com/keras-team/keras/blob/v2.14.0/keras/layers/attention/multi_head_attention.py // https://gist.github.com/sevagh/b71d253a347a9b59c026580625452fc5 return input; From 09d58682638d9c7005f7b0574fc021a966cb8acb Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Tue, 26 Dec 2023 20:46:48 +0100 Subject: [PATCH 09/31] Convert weights to tensors for ctor --- include/fdeep/import_model.hpp | 10 ++++++++-- include/fdeep/layers/multi_head_attention_layer.hpp | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index 4110b350..6df58915 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -1083,8 +1083,14 @@ inline layer_ptr create_multi_head_attention_layer( create_vector>(fplus::bind_1st_of_2( create_vector, create_size_t), get_param(name, "weight_shapes")); - const auto weights = create_vector(decode_floats, get_param(name, "weights")); - // todo: Convert weight_shapes and weights to Tensors before passing to ctor? 
+ const auto weight_values = create_vector(decode_floats, get_param(name, "weights")); + const auto weights = fplus::zip_with( + [](const std::vector& shape, const float_vec& values) -> tensor + { + return tensor( + create_tensor_shape_from_dims(shape), + fplus::convert_container(values)); + }, weight_shapes, weight_values); return std::make_shared(name, num_heads, key_dim, value_dim, use_bias, attention_axes, weights); } diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 63ea5333..4ad8c85c 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -20,7 +20,7 @@ class multi_head_attention_layer : public layer explicit multi_head_attention_layer(const std::string& name, std::size_t num_heads, std::size_t key_dim, std::size_t value_dim, bool use_bias, const std::vector& attention_axes, - const std::vector& weights) + const std::vector& weights) : layer(name), num_heads_(num_heads), key_dim_(key_dim), value_dim_(value_dim), use_bias_(use_bias), attention_axes_(attention_axes), weights_(weights) @@ -44,7 +44,7 @@ class multi_head_attention_layer : public layer std::size_t value_dim_; bool use_bias_; std::vector attention_axes_; - std::vector weights_; + std::vector weights_; }; } } // namespace fdeep, namespace internal From 212d60999d28dfa07cf1a3bb6a9c718ef8a649ab Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Tue, 26 Dec 2023 21:01:50 +0100 Subject: [PATCH 10/31] Separate weights and biases --- .../layers/multi_head_attention_layer.hpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 4ad8c85c..f649d3fe 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -20,12 +20,22 @@ class multi_head_attention_layer : public layer explicit multi_head_attention_layer(const std::string& name, std::size_t num_heads, std::size_t key_dim, std::size_t value_dim, bool use_bias, const std::vector& attention_axes, - const std::vector& weights) + const std::vector& saved_weights) : layer(name), num_heads_(num_heads), key_dim_(key_dim), - value_dim_(value_dim), use_bias_(use_bias), attention_axes_(attention_axes), - weights_(weights) + value_dim_(value_dim), attention_axes_(attention_axes), + weights_(extract_weights(saved_weights, use_bias)), + biases_(extract_biases(saved_weights, use_bias)) { } +private: + tensors extract_weights(const tensors& saved_weights, bool use_bias) + { + return use_bias ? fplus::unweave(saved_weights).first : saved_weights; + } + tensors extract_biases(const tensors& saved_weights, bool use_bias) + { + return use_bias ? 
fplus::unweave(saved_weights).second : tensors(); // todo: create biases with zeroes in right shape + } protected: tensors apply_impl(const tensors& input) const override { @@ -42,9 +52,9 @@ class multi_head_attention_layer : public layer std::size_t num_heads_; std::size_t key_dim_; std::size_t value_dim_; - bool use_bias_; std::vector attention_axes_; std::vector weights_; + std::vector biases_; }; } } // namespace fdeep, namespace internal From 118f663d7884f74994e080f75ccceaaaae201ab4 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Fri, 29 Dec 2023 16:20:09 +0100 Subject: [PATCH 11/31] apply dense layers to query, value and key --- include/fdeep/import_model.hpp | 4 +- .../layers/multi_head_attention_layer.hpp | 52 ++++++++++++++----- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index 6df58915..8f0347d4 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -1084,7 +1084,7 @@ inline layer_ptr create_multi_head_attention_layer( create_vector, create_size_t), get_param(name, "weight_shapes")); const auto weight_values = create_vector(decode_floats, get_param(name, "weights")); - const auto weights = fplus::zip_with( + const auto weights_and_biases = fplus::zip_with( [](const std::vector& shape, const float_vec& values) -> tensor { return tensor( @@ -1092,7 +1092,7 @@ inline layer_ptr create_multi_head_attention_layer( fplus::convert_container(values)); }, weight_shapes, weight_values); return std::make_shared(name, - num_heads, key_dim, value_dim, use_bias, attention_axes, weights); + num_heads, key_dim, value_dim, use_bias, attention_axes, weights_and_biases); } inline std::string get_activation_type(const nlohmann::json& data) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index f649d3fe..e7c37492 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -7,6 +7,7 @@ #pragma once #include "fdeep/layers/layer.hpp" +#include "fdeep/layers/dense_layer.hpp" #include "fdeep/layers/softmax_layer.hpp" #include @@ -20,41 +21,68 @@ class multi_head_attention_layer : public layer explicit multi_head_attention_layer(const std::string& name, std::size_t num_heads, std::size_t key_dim, std::size_t value_dim, bool use_bias, const std::vector& attention_axes, - const std::vector& saved_weights) + const std::vector& weights_and_biases) : layer(name), num_heads_(num_heads), key_dim_(key_dim), value_dim_(value_dim), attention_axes_(attention_axes), - weights_(extract_weights(saved_weights, use_bias)), - biases_(extract_biases(saved_weights, use_bias)) + query_dense_(create_dense_layer(weights_and_biases, use_bias, 0, name + "_query_dense")), + value_dense_(create_dense_layer(weights_and_biases, use_bias, 1, name + "_value_dense")), + key_dense_(create_dense_layer(weights_and_biases, use_bias, 2, name + "_key_dense")), + output_dense_(create_dense_layer(weights_and_biases, use_bias, 3, name + "_output_dense")) { } private: - tensors extract_weights(const tensors& saved_weights, bool use_bias) + dense_layer create_dense_layer( + const tensors& weights_and_biases, bool use_bias, + std::size_t index, const std::string& name) { - return use_bias ? fplus::unweave(saved_weights).first : saved_weights; + const std::size_t index_factor = use_bias ? 
2 : 1; + const tensor weights = weights_and_biases[index_factor * index]; + const std::size_t n = weights.shape().width_ * weights.shape().depth_; + const tensor biases = use_bias ? + weights_and_biases[index_factor * index + 1] : + tensor(tensor_shape(n), 1); + return dense_layer(name, n, *weights.as_vector(), *biases.as_vector()); } tensors extract_biases(const tensors& saved_weights, bool use_bias) { - return use_bias ? fplus::unweave(saved_weights).second : tensors(); // todo: create biases with zeroes in right shape + return use_bias ? fplus::unweave(saved_weights).second : tensors(); } protected: tensors apply_impl(const tensors& input) const override { assertion(input.size() == 2 || input.size() == 3, "Invalid number of inputs for MultiHeadAttention layer."); - //const tensor& query = input[0]; - //const tensor& value = input[1]; - //const tensor& key = input.size() > 2 ? input[2] : value; + const tensor query_raw = input[0]; + const tensor value_raw = input[1]; + const tensor key_raw = input.size() > 2 ? input[2] : value_raw; + const tensor query = query_dense_.apply({query_raw}).front(); + const tensor value = value_dense_.apply({value_raw}).front(); + const tensor key = key_dense_.apply({key_raw}).front(); + assertion( + query.shape().rank() == 2 && + value.shape().rank() == 2 && + key.shape().rank() == 2 && + query.shape().depth_ == value.shape().depth_ && + query.shape().depth_ == key.shape().depth_ && + value.shape().width_ == key.shape().width_, + "Invalid shapes; need a query tensor of shape (B, T, dim) and a value/key tensor of shape (B, S, dim)." + ); // https://towardsdatascience.com/transformers-explained-visually-part-3-multi-head-attention-deep-dive-1c1ff1024853 // https://dmol.pub/dl/attention.html#multi-head-attention-block // https://github.com/keras-team/keras/blob/v2.14.0/keras/layers/attention/multi_head_attention.py // https://gist.github.com/sevagh/b71d253a347a9b59c026580625452fc5 - return input; + const tensor scores = dot_product_tensors(query, transpose(key), std::vector({2, 1}), false); + const tensor distribution = softmax(scores); + const tensor output = dot_product_tensors(distribution, value, std::vector({2, 1}), false); + return output_dense_.apply({output}); } std::size_t num_heads_; std::size_t key_dim_; std::size_t value_dim_; std::vector attention_axes_; - std::vector weights_; - std::vector biases_; + dense_layer query_dense_; + dense_layer value_dense_; + dense_layer key_dense_; + dense_layer output_dense_; }; } } // namespace fdeep, namespace internal From 53f2d9ba9e90da49b7e3b3765fb067a1eb7cf474 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Fri, 29 Dec 2023 16:20:27 +0100 Subject: [PATCH 12/31] only allow the usual shapes --- keras_export/convert_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/keras_export/convert_model.py b/keras_export/convert_model.py index 3327a3ce..91bf7073 100755 --- a/keras_export/convert_model.py +++ b/keras_export/convert_model.py @@ -563,6 +563,8 @@ def show_additive_attention_layer(layer): def show_multi_head_attention_layer(layer): """Serialize MultiHeadAttention layer to dict""" + assert len(layer.input_shape) == 3 + assert layer.input_shape[0] is None assert layer._output_shape is None return { 'weight_shapes': list(map(lambda w: list(w.shape), layer.weights)), From 4184515a5a93126217c83432efb4efb418fc2305 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Fri, 29 Dec 2023 16:20:42 +0100 Subject: [PATCH 13/31] adjust tests --- keras_export/generate_test_models.py | 18 ++++++++++++++---- 1 file changed, 14 
insertions(+), 4 deletions(-) diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index 0d6c35d4..809168f6 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -436,15 +436,25 @@ def get_test_model_exhaustive(): outputs.append(AdditiveAttention(use_scale=True)([inputs[49], inputs[50]])) outputs.append(AdditiveAttention(use_scale=True)([inputs[49], inputs[50], inputs[51]])) + outputs.append(MultiHeadAttention( + num_heads=1, key_dim=1, value_dim=None, + use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) outputs.append(MultiHeadAttention( num_heads=1, key_dim=1, value_dim=None, use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) outputs.append(MultiHeadAttention( - num_heads=3, key_dim=2, value_dim=None, - use_bias=False, output_shape=None, attention_axes=(2, 3))(inputs[2], inputs[3])) + num_heads=3, key_dim=1, value_dim=None, + use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + # todo: re-enable + #outputs.append(MultiHeadAttention( + # num_heads=1, key_dim=2, value_dim=None, + # use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + #outputs.append(MultiHeadAttention( + # num_heads=1, key_dim=1, value_dim=2, + # use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) outputs.append(MultiHeadAttention( - num_heads=3, key_dim=2, value_dim=1, - use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) + num_heads=1, key_dim=1, value_dim=None, + use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) shared_conv = Conv2D(1, (1, 1), padding='valid', name='shared_conv', activation='relu') From 291e127054d97e211cf43976787e59f3c1b2aa1b Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sat, 30 Dec 2023 18:10:09 +0100 Subject: [PATCH 14/31] Fix default bias and loading order of key and value weights --- include/fdeep/layers/multi_head_attention_layer.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index e7c37492..ef0167b3 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -25,8 +25,8 @@ class multi_head_attention_layer : public layer : layer(name), num_heads_(num_heads), key_dim_(key_dim), value_dim_(value_dim), attention_axes_(attention_axes), query_dense_(create_dense_layer(weights_and_biases, use_bias, 0, name + "_query_dense")), - value_dense_(create_dense_layer(weights_and_biases, use_bias, 1, name + "_value_dense")), - key_dense_(create_dense_layer(weights_and_biases, use_bias, 2, name + "_key_dense")), + value_dense_(create_dense_layer(weights_and_biases, use_bias, 2, name + "_value_dense")), + key_dense_(create_dense_layer(weights_and_biases, use_bias, 1, name + "_key_dense")), output_dense_(create_dense_layer(weights_and_biases, use_bias, 3, name + "_output_dense")) { } @@ -40,7 +40,7 @@ class multi_head_attention_layer : public layer const std::size_t n = weights.shape().width_ * weights.shape().depth_; const tensor biases = use_bias ? 
weights_and_biases[index_factor * index + 1] : - tensor(tensor_shape(n), 1); + tensor(tensor_shape(n), 0); return dense_layer(name, n, *weights.as_vector(), *biases.as_vector()); } tensors extract_biases(const tensors& saved_weights, bool use_bias) From 5b3fbd22829cde2e0964b7357c0237e91216ea58 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sat, 30 Dec 2023 18:49:12 +0100 Subject: [PATCH 15/31] fix shapes, add tests --- .../layers/multi_head_attention_layer.hpp | 21 +++++++++--------- keras_export/generate_test_models.py | 22 ++++++++++++------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index ef0167b3..aa530d67 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -37,7 +37,7 @@ class multi_head_attention_layer : public layer { const std::size_t index_factor = use_bias ? 2 : 1; const tensor weights = weights_and_biases[index_factor * index]; - const std::size_t n = weights.shape().width_ * weights.shape().depth_; + const std::size_t n = weights.shape().depth_; const tensor biases = use_bias ? weights_and_biases[index_factor * index + 1] : tensor(tensor_shape(n), 0); @@ -54,18 +54,19 @@ class multi_head_attention_layer : public layer const tensor query_raw = input[0]; const tensor value_raw = input[1]; const tensor key_raw = input.size() > 2 ? input[2] : value_raw; - const tensor query = query_dense_.apply({query_raw}).front(); - const tensor value = value_dense_.apply({value_raw}).front(); - const tensor key = key_dense_.apply({key_raw}).front(); assertion( - query.shape().rank() == 2 && - value.shape().rank() == 2 && - key.shape().rank() == 2 && - query.shape().depth_ == value.shape().depth_ && - query.shape().depth_ == key.shape().depth_ && - value.shape().width_ == key.shape().width_, + query_raw.shape().rank() == 2 && + value_raw.shape().rank() == 2 && + key_raw.shape().rank() == 2 && + query_raw.shape().depth_ == value_raw.shape().depth_ && + query_raw.shape().depth_ == key_raw.shape().depth_ && + value_raw.shape().width_ == key_raw.shape().width_, "Invalid shapes; need a query tensor of shape (B, T, dim) and a value/key tensor of shape (B, S, dim)." 
); + const tensor query = query_dense_.apply({query_raw}).front(); + const tensor value = value_dense_.apply({value_raw}).front(); + const tensor key = key_dense_.apply({key_raw}).front(); + // https://towardsdatascience.com/transformers-explained-visually-part-3-multi-head-attention-deep-dive-1c1ff1024853 // https://dmol.pub/dl/attention.html#multi-head-attention-block // https://github.com/keras-team/keras/blob/v2.14.0/keras/layers/attention/multi_head_attention.py diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index 809168f6..1b6b4c23 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -443,18 +443,24 @@ def get_test_model_exhaustive(): num_heads=1, key_dim=1, value_dim=None, use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) outputs.append(MultiHeadAttention( - num_heads=3, key_dim=1, value_dim=None, + num_heads=1, key_dim=2, value_dim=None, use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + outputs.append(MultiHeadAttention( + num_heads=1, key_dim=2, value_dim=None, + use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + outputs.append(MultiHeadAttention( + num_heads=1, key_dim=1, value_dim=2, + use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + outputs.append(MultiHeadAttention( + num_heads=1, key_dim=1, value_dim=2, + use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) # todo: re-enable - #outputs.append(MultiHeadAttention( - # num_heads=1, key_dim=2, value_dim=None, + # outputs.append(MultiHeadAttention( + # num_heads=3, key_dim=1, value_dim=None, # use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) #outputs.append(MultiHeadAttention( - # num_heads=1, key_dim=1, value_dim=2, - # use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) - outputs.append(MultiHeadAttention( - num_heads=1, key_dim=1, value_dim=None, - use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) + # num_heads=1, key_dim=1, value_dim=None, + # use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) shared_conv = Conv2D(1, (1, 1), padding='valid', name='shared_conv', activation='relu') From 0e172675076fba9e7c6bff7be7fbb8ca3c033265 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 08:21:13 +0100 Subject: [PATCH 16/31] decode weights into multiple heads --- .../layers/multi_head_attention_layer.hpp | 73 +++++++++++++------ 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index aa530d67..74dc992a 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -24,36 +24,52 @@ class multi_head_attention_layer : public layer const std::vector& weights_and_biases) : layer(name), num_heads_(num_heads), key_dim_(key_dim), value_dim_(value_dim), attention_axes_(attention_axes), - query_dense_(create_dense_layer(weights_and_biases, use_bias, 0, name + "_query_dense")), - value_dense_(create_dense_layer(weights_and_biases, use_bias, 2, name + "_value_dense")), - key_dense_(create_dense_layer(weights_and_biases, use_bias, 1, name + "_key_dense")), - output_dense_(create_dense_layer(weights_and_biases, use_bias, 3, name + "_output_dense")) + 
query_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 0, name + "_query_dense")), + value_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 2, name + "_value_dense")), + key_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 1, name + "_key_dense")), + output_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 3, name + "_output_dense")) { } private: - dense_layer create_dense_layer( - const tensors& weights_and_biases, bool use_bias, - std::size_t index, const std::string& name) + std::vector create_dense_layers( + const tensors& weights_and_biases, bool use_bias, const std::size_t num_heads, + const std::size_t index, const std::string& name) { const std::size_t index_factor = use_bias ? 2 : 1; const tensor weights = weights_and_biases[index_factor * index]; - const std::size_t n = weights.shape().depth_; + const std::size_t units = weights.shape().depth_; const tensor biases = use_bias ? weights_and_biases[index_factor * index + 1] : - tensor(tensor_shape(n), 0); - return dense_layer(name, n, *weights.as_vector(), *biases.as_vector()); + tensor(index == 3 ? tensor_shape(num_heads, 1, units) : tensor_shape(num_heads, units), 0); + const auto weights_per_head = + index == 3 ? tensor_to_tensors_height_slices(weights) : tensor_to_tensors_width_slices(weights); + const auto biases_per_head = + index == 3 ? tensor_to_tensors_height_slices(biases) : tensor_to_tensors_width_slices(biases); + assertion(weights_per_head.size() == num_heads, "Invalid weights for number of heads."); + assertion(biases_per_head.size() == num_heads, "Invalid biases for number of heads."); + const std::vector dense_layers = + fplus::transform( + [&](const std::pair>& n_and_w_with_b) + { + return dense_layer( + name + "_" + std::to_string(n_and_w_with_b.first), + units, + *n_and_w_with_b.second.first.as_vector(), + *n_and_w_with_b.second.second.as_vector()); + }, + fplus::enumerate(fplus::zip(weights_per_head, biases_per_head))); + return dense_layers; } tensors extract_biases(const tensors& saved_weights, bool use_bias) { return use_bias ? fplus::unweave(saved_weights).second : tensors(); } -protected: - tensors apply_impl(const tensors& input) const override + tensor apply_head( + const tensor& query_raw, + const tensor& value_raw, + const tensor& key_raw, + std::size_t head_index) const { - assertion(input.size() == 2 || input.size() == 3, "Invalid number of inputs for MultiHeadAttention layer."); - const tensor query_raw = input[0]; - const tensor value_raw = input[1]; - const tensor key_raw = input.size() > 2 ? input[2] : value_raw; assertion( query_raw.shape().rank() == 2 && value_raw.shape().rank() == 2 && @@ -63,9 +79,9 @@ class multi_head_attention_layer : public layer value_raw.shape().width_ == key_raw.shape().width_, "Invalid shapes; need a query tensor of shape (B, T, dim) and a value/key tensor of shape (B, S, dim)." 
); - const tensor query = query_dense_.apply({query_raw}).front(); - const tensor value = value_dense_.apply({value_raw}).front(); - const tensor key = key_dense_.apply({key_raw}).front(); + const tensor query = query_dense_[head_index].apply({query_raw}).front(); + const tensor value = value_dense_[head_index].apply({value_raw}).front(); + const tensor key = key_dense_[head_index].apply({key_raw}).front(); // https://towardsdatascience.com/transformers-explained-visually-part-3-multi-head-attention-deep-dive-1c1ff1024853 // https://dmol.pub/dl/attention.html#multi-head-attention-block @@ -74,16 +90,25 @@ class multi_head_attention_layer : public layer const tensor scores = dot_product_tensors(query, transpose(key), std::vector({2, 1}), false); const tensor distribution = softmax(scores); const tensor output = dot_product_tensors(distribution, value, std::vector({2, 1}), false); - return output_dense_.apply({output}); + return output_dense_[head_index].apply({output}).front(); // todo + } +protected: + tensors apply_impl(const tensors& input) const override + { + assertion(input.size() == 2 || input.size() == 3, "Invalid number of inputs for MultiHeadAttention layer."); + const tensor query_raw = input[0]; + const tensor value_raw = input[1]; + const tensor key_raw = input.size() > 2 ? input[2] : value_raw; + return {apply_head(query_raw, value_raw, key_raw, 0)}; // todo: all } std::size_t num_heads_; std::size_t key_dim_; std::size_t value_dim_; std::vector attention_axes_; - dense_layer query_dense_; - dense_layer value_dense_; - dense_layer key_dense_; - dense_layer output_dense_; + std::vector query_dense_; + std::vector value_dense_; + std::vector key_dense_; + std::vector output_dense_; }; } } // namespace fdeep, namespace internal From d896c4c3e1998e3dbf95bdabe3e122b6498225dd Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 08:21:20 +0100 Subject: [PATCH 17/31] enable tests --- keras_export/generate_test_models.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index 1b6b4c23..37bb3886 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -454,13 +454,12 @@ def get_test_model_exhaustive(): outputs.append(MultiHeadAttention( num_heads=1, key_dim=1, value_dim=2, use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) - # todo: re-enable - # outputs.append(MultiHeadAttention( - # num_heads=3, key_dim=1, value_dim=None, - # use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) - #outputs.append(MultiHeadAttention( - # num_heads=1, key_dim=1, value_dim=None, - # use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) + outputs.append(MultiHeadAttention( + num_heads=3, key_dim=1, value_dim=None, + use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + outputs.append(MultiHeadAttention( + num_heads=1, key_dim=1, value_dim=None, + use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) shared_conv = Conv2D(1, (1, 1), padding='valid', name='shared_conv', activation='relu') From 3176f89314185794fa8122fd0de8e68cc6a06091 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 09:06:39 +0100 Subject: [PATCH 18/31] create dense output layer separately --- .../layers/multi_head_attention_layer.hpp | 52 ++++++++++++++----- keras_export/generate_test_models.py | 3 ++ 2 files changed, 
43 insertions(+), 12 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 74dc992a..bd293fd4 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -27,7 +27,7 @@ class multi_head_attention_layer : public layer query_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 0, name + "_query_dense")), value_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 2, name + "_value_dense")), key_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 1, name + "_key_dense")), - output_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 3, name + "_output_dense")) + output_dense_(create_output_dense_layer(weights_and_biases, use_bias, name + "_output_dense")) { } private: @@ -35,16 +35,22 @@ class multi_head_attention_layer : public layer const tensors& weights_and_biases, bool use_bias, const std::size_t num_heads, const std::size_t index, const std::string& name) { + assertion(index <= 2, "Invalid dense layer index."); + const std::size_t index_factor = use_bias ? 2 : 1; - const tensor weights = weights_and_biases[index_factor * index]; + + tensor weights = weights_and_biases[index_factor * index]; + if (index == 3) + weights = permute_tensor(weights, {3, 1, 2}); + const std::size_t units = weights.shape().depth_; - const tensor biases = use_bias ? + + tensor biases = use_bias ? weights_and_biases[index_factor * index + 1] : - tensor(index == 3 ? tensor_shape(num_heads, 1, units) : tensor_shape(num_heads, units), 0); - const auto weights_per_head = - index == 3 ? tensor_to_tensors_height_slices(weights) : tensor_to_tensors_width_slices(weights); - const auto biases_per_head = - index == 3 ? tensor_to_tensors_height_slices(biases) : tensor_to_tensors_width_slices(biases); + tensor(index == 3 ? tensor_shape(units) : tensor_shape(num_heads, units), 0); + + const auto weights_per_head = tensor_to_tensors_width_slices(weights); + const auto biases_per_head = tensor_to_tensors_width_slices(biases); assertion(weights_per_head.size() == num_heads, "Invalid weights for number of heads."); assertion(biases_per_head.size() == num_heads, "Invalid biases for number of heads."); const std::vector dense_layers = @@ -60,6 +66,23 @@ class multi_head_attention_layer : public layer fplus::enumerate(fplus::zip(weights_per_head, biases_per_head))); return dense_layers; } + dense_layer create_output_dense_layer( + const tensors& weights_and_biases, bool use_bias, const std::string& name) + { + const std::size_t index_factor = use_bias ? 2 : 1; + + tensor weights = weights_and_biases[index_factor * 3]; + + const std::size_t units = weights.shape().depth_; + + tensor biases = use_bias ? + weights_and_biases[index_factor * 3 + 1] : + tensor(tensor_shape(units), 0); + + const auto weights_per_head = tensor_to_tensors_width_slices(weights); + const auto biases_per_head = tensor_to_tensors_width_slices(biases); + return dense_layer(name + "_output", units, *weights.as_vector(), *biases.as_vector()); + } tensors extract_biases(const tensors& saved_weights, bool use_bias) { return use_bias ? 
fplus::unweave(saved_weights).second : tensors(); @@ -89,8 +112,7 @@ class multi_head_attention_layer : public layer // https://gist.github.com/sevagh/b71d253a347a9b59c026580625452fc5 const tensor scores = dot_product_tensors(query, transpose(key), std::vector({2, 1}), false); const tensor distribution = softmax(scores); - const tensor output = dot_product_tensors(distribution, value, std::vector({2, 1}), false); - return output_dense_[head_index].apply({output}).front(); // todo + return dot_product_tensors(distribution, value, std::vector({2, 1}), false); } protected: tensors apply_impl(const tensors& input) const override @@ -99,16 +121,22 @@ class multi_head_attention_layer : public layer const tensor query_raw = input[0]; const tensor value_raw = input[1]; const tensor key_raw = input.size() > 2 ? input[2] : value_raw; - return {apply_head(query_raw, value_raw, key_raw, 0)}; // todo: all + const auto outputs = fplus::transform([&](const std::size_t head_idx) + { + return apply_head(query_raw, value_raw, key_raw, head_idx); + }, fplus::numbers(0, num_heads_)); + const tensor merged = concatenate_tensors_depth(outputs); + return output_dense_.apply({merged}); } std::size_t num_heads_; std::size_t key_dim_; std::size_t value_dim_; std::vector attention_axes_; + // todo: store each head as a separate object? std::vector query_dense_; std::vector value_dense_; std::vector key_dense_; - std::vector output_dense_; + dense_layer output_dense_; }; } } // namespace fdeep, namespace internal diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index 37bb3886..018a2124 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -457,6 +457,9 @@ def get_test_model_exhaustive(): outputs.append(MultiHeadAttention( num_heads=3, key_dim=1, value_dim=None, use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) + outputs.append(MultiHeadAttention( + num_heads=3, key_dim=1, value_dim=None, + use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50])) outputs.append(MultiHeadAttention( num_heads=1, key_dim=1, value_dim=None, use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) From c7adc7c7ff15285896e64f82e95f5fa2e3b7aca6 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 12:17:55 +0100 Subject: [PATCH 19/31] clean up --- include/fdeep/layers/multi_head_attention_layer.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index bd293fd4..8d2cdde1 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -40,14 +40,12 @@ class multi_head_attention_layer : public layer const std::size_t index_factor = use_bias ? 2 : 1; tensor weights = weights_and_biases[index_factor * index]; - if (index == 3) - weights = permute_tensor(weights, {3, 1, 2}); const std::size_t units = weights.shape().depth_; tensor biases = use_bias ? weights_and_biases[index_factor * index + 1] : - tensor(index == 3 ? 
tensor_shape(units) : tensor_shape(num_heads, units), 0); + tensor(tensor_shape(num_heads, units), 0); const auto weights_per_head = tensor_to_tensors_width_slices(weights); const auto biases_per_head = tensor_to_tensors_width_slices(biases); @@ -79,8 +77,6 @@ class multi_head_attention_layer : public layer weights_and_biases[index_factor * 3 + 1] : tensor(tensor_shape(units), 0); - const auto weights_per_head = tensor_to_tensors_width_slices(weights); - const auto biases_per_head = tensor_to_tensors_width_slices(biases); return dense_layer(name + "_output", units, *weights.as_vector(), *biases.as_vector()); } tensors extract_biases(const tensors& saved_weights, bool use_bias) From db62540cba18fcde87ed5465226d46ea39e2e183 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 12:18:04 +0100 Subject: [PATCH 20/31] add more tests --- keras_export/generate_test_models.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index 018a2124..d8eb14c3 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -463,6 +463,12 @@ def get_test_model_exhaustive(): outputs.append(MultiHeadAttention( num_heads=1, key_dim=1, value_dim=None, use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) + outputs.append(MultiHeadAttention( + num_heads=2, key_dim=3, value_dim=5, + use_bias=False, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) + outputs.append(MultiHeadAttention( + num_heads=2, key_dim=3, value_dim=5, + use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) shared_conv = Conv2D(1, (1, 1), padding='valid', name='shared_conv', activation='relu') From 702cb60b3e0d7f2dfcf35cf5e648af084c97a328 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 12:31:13 +0100 Subject: [PATCH 21/31] shorten --- .../layers/multi_head_attention_layer.hpp | 30 +++++++------------ 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 8d2cdde1..95782683 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -36,13 +36,9 @@ class multi_head_attention_layer : public layer const std::size_t index, const std::string& name) { assertion(index <= 2, "Invalid dense layer index."); - const std::size_t index_factor = use_bias ? 2 : 1; - tensor weights = weights_and_biases[index_factor * index]; - const std::size_t units = weights.shape().depth_; - tensor biases = use_bias ? 
weights_and_biases[index_factor * index + 1] : tensor(tensor_shape(num_heads, units), 0); @@ -51,32 +47,26 @@ class multi_head_attention_layer : public layer const auto biases_per_head = tensor_to_tensors_width_slices(biases); assertion(weights_per_head.size() == num_heads, "Invalid weights for number of heads."); assertion(biases_per_head.size() == num_heads, "Invalid biases for number of heads."); - const std::vector dense_layers = - fplus::transform( - [&](const std::pair>& n_and_w_with_b) - { - return dense_layer( - name + "_" + std::to_string(n_and_w_with_b.first), - units, - *n_and_w_with_b.second.first.as_vector(), - *n_and_w_with_b.second.second.as_vector()); - }, - fplus::enumerate(fplus::zip(weights_per_head, biases_per_head))); - return dense_layers; + return fplus::transform( + [&](const std::pair>& n_and_w_with_b) + { + return dense_layer( + name + "_" + std::to_string(n_and_w_with_b.first), + units, + *n_and_w_with_b.second.first.as_vector(), + *n_and_w_with_b.second.second.as_vector()); + }, + fplus::enumerate(fplus::zip(weights_per_head, biases_per_head))); } dense_layer create_output_dense_layer( const tensors& weights_and_biases, bool use_bias, const std::string& name) { const std::size_t index_factor = use_bias ? 2 : 1; - tensor weights = weights_and_biases[index_factor * 3]; - const std::size_t units = weights.shape().depth_; - tensor biases = use_bias ? weights_and_biases[index_factor * 3 + 1] : tensor(tensor_shape(units), 0); - return dense_layer(name + "_output", units, *weights.as_vector(), *biases.as_vector()); } tensors extract_biases(const tensors& saved_weights, bool use_bias) From b825b5772b5f389ccddd4e05740bf3b4a79108e2 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 12:53:37 +0100 Subject: [PATCH 22/31] const --- include/fdeep/layers/multi_head_attention_layer.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 95782683..33a46b10 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -37,9 +37,9 @@ class multi_head_attention_layer : public layer { assertion(index <= 2, "Invalid dense layer index."); const std::size_t index_factor = use_bias ? 2 : 1; - tensor weights = weights_and_biases[index_factor * index]; + const tensor weights = weights_and_biases[index_factor * index]; const std::size_t units = weights.shape().depth_; - tensor biases = use_bias ? + const tensor biases = use_bias ? weights_and_biases[index_factor * index + 1] : tensor(tensor_shape(num_heads, units), 0); @@ -62,9 +62,9 @@ class multi_head_attention_layer : public layer const tensors& weights_and_biases, bool use_bias, const std::string& name) { const std::size_t index_factor = use_bias ? 2 : 1; - tensor weights = weights_and_biases[index_factor * 3]; + const tensor weights = weights_and_biases[index_factor * 3]; const std::size_t units = weights.shape().depth_; - tensor biases = use_bias ? + const tensor biases = use_bias ? 
weights_and_biases[index_factor * 3 + 1] : tensor(tensor_shape(units), 0); return dense_layer(name + "_output", units, *weights.as_vector(), *biases.as_vector()); From 67264eab88225f6282a11c95c127bcb0d0debab3 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 13:27:40 +0100 Subject: [PATCH 23/31] teeeests --- keras_export/generate_test_models.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index d8eb14c3..c7863c02 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -149,6 +149,9 @@ def get_test_model_exhaustive(): (5, 4), (7, 4), # 50 (7, 4), + (1, 1), # todo: remove + (2, 1), # todo: remove + (1, 2), # todo: remove ] inputs = [Input(shape=s) for s in input_shapes] @@ -469,6 +472,15 @@ def get_test_model_exhaustive(): outputs.append(MultiHeadAttention( num_heads=2, key_dim=3, value_dim=5, use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) + outputs.append(MultiHeadAttention( # todo: remove + num_heads=1, key_dim=2, value_dim=None, + use_bias=True, output_shape=None, attention_axes=None)(inputs[52], inputs[52], inputs[52])) + outputs.append(MultiHeadAttention( # todo: remove + num_heads=1, key_dim=2, value_dim=None, + use_bias=True, output_shape=None, attention_axes=None)(inputs[53], inputs[53], inputs[53])) + outputs.append(MultiHeadAttention( # todo: remove + num_heads=1, key_dim=2, value_dim=None, + use_bias=True, output_shape=None, attention_axes=None)(inputs[54], inputs[54], inputs[54])) shared_conv = Conv2D(1, (1, 1), padding='valid', name='shared_conv', activation='relu') From 8f630a8b5eadba457dfa3a1ae7d1c3bf63b4f386 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 13:49:57 +0100 Subject: [PATCH 24/31] fix the distribution calculation by dividing with the square root of query_size before applying the softmax --- include/fdeep/layers/multi_head_attention_layer.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 33a46b10..54ed6ebf 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -97,7 +97,8 @@ class multi_head_attention_layer : public layer // https://github.com/keras-team/keras/blob/v2.14.0/keras/layers/attention/multi_head_attention.py // https://gist.github.com/sevagh/b71d253a347a9b59c026580625452fc5 const tensor scores = dot_product_tensors(query, transpose(key), std::vector({2, 1}), false); - const tensor distribution = softmax(scores); + const std::size_t query_size = query.shape().depth_; + const tensor distribution = softmax(transform_tensor(fplus::multiply_with(1 / std::sqrt(query_size)), scores)); return dot_product_tensors(distribution, value, std::vector({2, 1}), false); } protected: From 0c6cc0a95bd502f50bad318271b56f23d5e950a1 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 13:50:31 +0100 Subject: [PATCH 25/31] remove debug tests --- keras_export/generate_test_models.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/keras_export/generate_test_models.py b/keras_export/generate_test_models.py index c7863c02..d8eb14c3 100644 --- a/keras_export/generate_test_models.py +++ b/keras_export/generate_test_models.py @@ -149,9 +149,6 @@ def get_test_model_exhaustive(): (5, 4), (7, 4), # 50 (7, 4), - (1, 1), # todo: remove - (2, 1), # todo: remove - 
(1, 2), # todo: remove ] inputs = [Input(shape=s) for s in input_shapes] @@ -472,15 +469,6 @@ def get_test_model_exhaustive(): outputs.append(MultiHeadAttention( num_heads=2, key_dim=3, value_dim=5, use_bias=True, output_shape=None, attention_axes=None)(inputs[49], inputs[50], inputs[51])) - outputs.append(MultiHeadAttention( # todo: remove - num_heads=1, key_dim=2, value_dim=None, - use_bias=True, output_shape=None, attention_axes=None)(inputs[52], inputs[52], inputs[52])) - outputs.append(MultiHeadAttention( # todo: remove - num_heads=1, key_dim=2, value_dim=None, - use_bias=True, output_shape=None, attention_axes=None)(inputs[53], inputs[53], inputs[53])) - outputs.append(MultiHeadAttention( # todo: remove - num_heads=1, key_dim=2, value_dim=None, - use_bias=True, output_shape=None, attention_axes=None)(inputs[54], inputs[54], inputs[54])) shared_conv = Conv2D(1, (1, 1), padding='valid', name='shared_conv', activation='relu') From 0d2be86849a6b32650465450df76852ea451d8c9 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 13:55:07 +0100 Subject: [PATCH 26/31] remove todo comment --- include/fdeep/layers/multi_head_attention_layer.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 54ed6ebf..6633c561 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -119,7 +119,6 @@ class multi_head_attention_layer : public layer std::size_t key_dim_; std::size_t value_dim_; std::vector attention_axes_; - // todo: store each head as a separate object? std::vector query_dense_; std::vector value_dense_; std::vector key_dense_; From fd6e7c4ece9c236232f8b3fbb090c8c5924ad9ac Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 13:58:20 +0100 Subject: [PATCH 27/31] Check for attention_axes=None in conversion --- keras_export/convert_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_export/convert_model.py b/keras_export/convert_model.py index 91bf7073..66fa9259 100755 --- a/keras_export/convert_model.py +++ b/keras_export/convert_model.py @@ -566,6 +566,7 @@ def show_multi_head_attention_layer(layer): assert len(layer.input_shape) == 3 assert layer.input_shape[0] is None assert layer._output_shape is None + assert layer._attention_axes == (1,), "MultiHeadAttention supported only with attention_axes=None" return { 'weight_shapes': list(map(lambda w: list(w.shape), layer.weights)), 'weights': list(map(lambda w: encode_floats(w.numpy()), layer.weights)), From a95abf465ee8ad9c8be2a1a4587026f47aaa625c Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 18:43:49 +0100 Subject: [PATCH 28/31] Do not pass unused attention_axes --- include/fdeep/import_model.hpp | 53 ++++++++++--------- .../layers/multi_head_attention_layer.hpp | 6 +-- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index 8f0347d4..02f37f80 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -1077,8 +1077,6 @@ inline layer_ptr create_multi_head_attention_layer( const std::size_t key_dim = data["config"]["key_dim"]; const std::size_t value_dim = data["config"]["value_dim"]; const bool use_bias = data["config"]["use_bias"]; - const auto attention_axes = create_vector(create_size_t, - data["config"]["attention_axes"]); const auto weight_shapes = create_vector>(fplus::bind_1st_of_2( create_vector, create_size_t), @@ 
-1092,7 +1090,7 @@ inline layer_ptr create_multi_head_attention_layer( fplus::convert_container(values)); }, weight_shapes, weight_values); return std::make_shared(name, - num_heads, key_dim, value_dim, use_bias, attention_axes, weights_and_biases); + num_heads, key_dim, value_dim, use_bias, weights_and_biases); } inline std::string get_activation_type(const nlohmann::json& data) @@ -1507,38 +1505,45 @@ inline void check_test_outputs(float_type epsilon, assertion(output.shape() == target.shape(), "Wrong output size. Is " + show_tensor_shape(output.shape()) + ", should be " + show_tensor_shape(target.shape()) + "."); - for (std::size_t pos_dim_5 = 0; pos_dim_5 < output.shape().size_dim_5_; ++pos_dim_5) + try { - for (std::size_t pos_dim_4 = 0; pos_dim_4 < output.shape().size_dim_4_; ++pos_dim_4) + for (std::size_t pos_dim_5 = 0; pos_dim_5 < output.shape().size_dim_5_; ++pos_dim_5) { - for (std::size_t y = 0; y < output.shape().height_; ++y) + for (std::size_t pos_dim_4 = 0; pos_dim_4 < output.shape().size_dim_4_; ++pos_dim_4) { - for (std::size_t x = 0; x < output.shape().width_; ++x) + for (std::size_t y = 0; y < output.shape().height_; ++y) { - for (std::size_t z = 0; z < output.shape().depth_; ++z) + for (std::size_t x = 0; x < output.shape().width_; ++x) { - const tensor_pos pos(pos_dim_5, pos_dim_4, y, x, z); - const auto target_val = target.get_ignore_rank(pos); - const auto output_val = output.get_ignore_rank(pos); - if (!fplus::is_in_closed_interval_around(epsilon, - target_val, output_val) && - !(std::isnan(target_val) && std::isnan(output_val))) + for (std::size_t z = 0; z < output.shape().depth_; ++z) { - const std::string msg = - std::string("test failed: ") + - "output=" + fplus::show(i) + " " + - "pos=" + - fplus::show(y) + "," + - fplus::show(x) + "," + - fplus::show(z) + " " + - "value=" + fplus::show(output_val) + " " - "target=" + fplus::show(target_val); - internal::raise_error(msg); + const tensor_pos pos(pos_dim_5, pos_dim_4, y, x, z); + const auto target_val = target.get_ignore_rank(pos); + const auto output_val = output.get_ignore_rank(pos); + if (!fplus::is_in_closed_interval_around(epsilon, + target_val, output_val) && + !(std::isnan(target_val) && std::isnan(output_val))) + { + const std::string msg = + std::string("test failed: ") + + "output=" + fplus::show(i) + " " + + "pos=" + + fplus::show(y) + "," + + fplus::show(x) + "," + + fplus::show(z) + " " + + "value=" + fplus::show(output_val) + " " + "target=" + fplus::show(target_val); + std::cout << msg << std::endl; + //internal::raise_error(msg); + } } } } } } + } catch (const std::runtime_error& ex) + { + std::cout << ex.what() << std::endl; } } } diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 6633c561..8ee7e8f5 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -20,10 +20,9 @@ class multi_head_attention_layer : public layer public: explicit multi_head_attention_layer(const std::string& name, std::size_t num_heads, std::size_t key_dim, std::size_t value_dim, - bool use_bias, const std::vector& attention_axes, - const std::vector& weights_and_biases) + bool use_bias, const std::vector& weights_and_biases) : layer(name), num_heads_(num_heads), key_dim_(key_dim), - value_dim_(value_dim), attention_axes_(attention_axes), + value_dim_(value_dim), query_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 0, name + "_query_dense")), 
value_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 2, name + "_value_dense")), key_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 1, name + "_key_dense")), @@ -118,7 +117,6 @@ class multi_head_attention_layer : public layer std::size_t num_heads_; std::size_t key_dim_; std::size_t value_dim_; - std::vector attention_axes_; std::vector query_dense_; std::vector value_dense_; std::vector key_dense_; From 41ac53ace48b4bd0291fce4fe7242a0afa7b518e Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 18:52:41 +0100 Subject: [PATCH 29/31] double-check weights shapes --- include/fdeep/layers/multi_head_attention_layer.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp index 8ee7e8f5..32ec0a16 100644 --- a/include/fdeep/layers/multi_head_attention_layer.hpp +++ b/include/fdeep/layers/multi_head_attention_layer.hpp @@ -23,25 +23,27 @@ class multi_head_attention_layer : public layer bool use_bias, const std::vector& weights_and_biases) : layer(name), num_heads_(num_heads), key_dim_(key_dim), value_dim_(value_dim), - query_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 0, name + "_query_dense")), - value_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 2, name + "_value_dense")), - key_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 1, name + "_key_dense")), + query_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 0, key_dim, name + "_query_dense")), + value_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 2, value_dim, name + "_value_dense")), + key_dense_(create_dense_layers(weights_and_biases, use_bias, num_heads, 1, key_dim, name + "_key_dense")), output_dense_(create_output_dense_layer(weights_and_biases, use_bias, name + "_output_dense")) { } private: std::vector create_dense_layers( const tensors& weights_and_biases, bool use_bias, const std::size_t num_heads, - const std::size_t index, const std::string& name) + const std::size_t index, const std::size_t units, const std::string& name) { assertion(index <= 2, "Invalid dense layer index."); const std::size_t index_factor = use_bias ? 2 : 1; const tensor weights = weights_and_biases[index_factor * index]; - const std::size_t units = weights.shape().depth_; const tensor biases = use_bias ? 
weights_and_biases[index_factor * index + 1] : tensor(tensor_shape(num_heads, units), 0); + assertion(weights.shape().depth_ == units, "Invalid weights shape for attention head dimension."); + assertion(biases.shape().depth_ == units, "Invalid biases shape for attention head dimension."); + const auto weights_per_head = tensor_to_tensors_width_slices(weights); const auto biases_per_head = tensor_to_tensors_width_slices(biases); assertion(weights_per_head.size() == num_heads, "Invalid weights for number of heads."); From b35deb9c8c71f1a5e4d9daec323a3b51ecc66752 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 18:54:35 +0100 Subject: [PATCH 30/31] Revert debug output --- include/fdeep/import_model.hpp | 49 +++++++++++++++------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/include/fdeep/import_model.hpp b/include/fdeep/import_model.hpp index 02f37f80..004878fb 100644 --- a/include/fdeep/import_model.hpp +++ b/include/fdeep/import_model.hpp @@ -1505,45 +1505,38 @@ inline void check_test_outputs(float_type epsilon, assertion(output.shape() == target.shape(), "Wrong output size. Is " + show_tensor_shape(output.shape()) + ", should be " + show_tensor_shape(target.shape()) + "."); - try + for (std::size_t pos_dim_5 = 0; pos_dim_5 < output.shape().size_dim_5_; ++pos_dim_5) { - for (std::size_t pos_dim_5 = 0; pos_dim_5 < output.shape().size_dim_5_; ++pos_dim_5) + for (std::size_t pos_dim_4 = 0; pos_dim_4 < output.shape().size_dim_4_; ++pos_dim_4) { - for (std::size_t pos_dim_4 = 0; pos_dim_4 < output.shape().size_dim_4_; ++pos_dim_4) + for (std::size_t y = 0; y < output.shape().height_; ++y) { - for (std::size_t y = 0; y < output.shape().height_; ++y) + for (std::size_t x = 0; x < output.shape().width_; ++x) { - for (std::size_t x = 0; x < output.shape().width_; ++x) + for (std::size_t z = 0; z < output.shape().depth_; ++z) { - for (std::size_t z = 0; z < output.shape().depth_; ++z) + const tensor_pos pos(pos_dim_5, pos_dim_4, y, x, z); + const auto target_val = target.get_ignore_rank(pos); + const auto output_val = output.get_ignore_rank(pos); + if (!fplus::is_in_closed_interval_around(epsilon, + target_val, output_val) && + !(std::isnan(target_val) && std::isnan(output_val))) { - const tensor_pos pos(pos_dim_5, pos_dim_4, y, x, z); - const auto target_val = target.get_ignore_rank(pos); - const auto output_val = output.get_ignore_rank(pos); - if (!fplus::is_in_closed_interval_around(epsilon, - target_val, output_val) && - !(std::isnan(target_val) && std::isnan(output_val))) - { - const std::string msg = - std::string("test failed: ") + - "output=" + fplus::show(i) + " " + - "pos=" + - fplus::show(y) + "," + - fplus::show(x) + "," + - fplus::show(z) + " " + - "value=" + fplus::show(output_val) + " " - "target=" + fplus::show(target_val); - std::cout << msg << std::endl; - //internal::raise_error(msg); - } + const std::string msg = + std::string("test failed: ") + + "output=" + fplus::show(i) + " " + + "pos=" + + fplus::show(y) + "," + + fplus::show(x) + "," + + fplus::show(z) + " " + + "value=" + fplus::show(output_val) + " " + "target=" + fplus::show(target_val); + internal::raise_error(msg); } } } } } - } catch (const std::runtime_error& ex) - { - std::cout << ex.what() << std::endl; } } } From 7a574b7c9f7dfaf8b5235b0d437962ffd0076ec2 Mon Sep 17 00:00:00 2001 From: Dobiasd Date: Sun, 31 Dec 2023 18:57:56 +0100 Subject: [PATCH 31/31] Add MultiHeadAttention layer to list of supported layers in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-)

diff --git a/README.md b/README.md
index fb3c503e..135d527d 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,7 @@ Would you like to build/train a model using Keras/Python? And would you like to
 * `UpSampling1D/2D`, `Resizing`
 * `Reshape`, `Permute`, `RepeatVector`
 * `Embedding`, `CategoryEncoding`
-* `Attention`, `AdditiveAttention`
+* `Attention`, `AdditiveAttention`, `MultiHeadAttention`
 
 ### Also supported
 
@@ -78,7 +78,7 @@ Would you like to build/train a model using Keras/Python? And would you like to
 `GRUCell`, `Hashing`, `IntegerLookup`,
 `LocallyConnected1D`, `LocallyConnected2D`,
-`LSTMCell`, `Masking`, `MultiHeadAttention`,
+`LSTMCell`, `Masking`,
 `RepeatVector`, `RNN`, `SimpleRNN`, `SimpleRNNCell`,
 `StackedRNNCells`, `StringLookup`, `TextVectorization`,
 `ThresholdedReLU`, `Upsampling3D`, `temporal` models
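
For reference, the computation the layer ends up performing across this series (per-head query/key/value projections, scaled dot-product attention with the 1/sqrt scaling introduced in PATCH 24, concatenation of the heads, and a final output projection) can be sketched in a few lines of NumPy. This is only an illustrative toy under simplifying assumptions: attention_axes=None, 3D inputs without a batch dimension, biases omitted, and made-up weight shapes. It is not the fdeep or Keras API; the function and variable names below are invented for the example.

import numpy as np

def softmax(x, axis=-1):
    # numerically stable softmax over the last axis
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# query: (tq, d_model); value, key: (tv, d_model)
# wq, wk: one (d_model, key_dim) matrix per head; wv: one (d_model, value_dim) per head
# wo: (num_heads * value_dim, d_model); biases left out for brevity
def multi_head_attention(query, value, key, wq, wk, wv, wo):
    heads = []
    for Wq, Wk, Wv in zip(wq, wk, wv):
        q, k, v = query @ Wq, key @ Wk, value @ Wv
        scores = q @ k.T / np.sqrt(q.shape[-1])  # scale before the softmax
        heads.append(softmax(scores) @ v)        # (tq, value_dim) per head
    return np.concatenate(heads, axis=-1) @ wo   # (tq, d_model)

rng = np.random.default_rng(0)
d_model, key_dim, value_dim, num_heads, tq, tv = 4, 3, 5, 2, 7, 6
wq = [rng.normal(size=(d_model, key_dim)) for _ in range(num_heads)]
wk = [rng.normal(size=(d_model, key_dim)) for _ in range(num_heads)]
wv = [rng.normal(size=(d_model, value_dim)) for _ in range(num_heads)]
wo = rng.normal(size=(num_heads * value_dim, d_model))
out = multi_head_attention(rng.normal(size=(tq, d_model)),
                           rng.normal(size=(tv, d_model)),
                           rng.normal(size=(tv, d_model)),
                           wq, wk, wv, wo)
print(out.shape)  # (7, 4)

Dividing the scores by the square root of the projected query depth before the softmax is the standard scaled dot-product formulation; without it, the attention weights tend to saturate as the key dimension grows, which is the discrepancy the scaling fix in this series addresses.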