diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp
index c14439b7..664401bf 100644
--- a/include/fdeep/layers/multi_head_attention_layer.hpp
+++ b/include/fdeep/layers/multi_head_attention_layer.hpp
@@ -30,6 +30,7 @@ class multi_head_attention_layer : public layer
     tensors apply_impl(const tensors& input) const override
     {
         // input.size() is 1. How shall the other tensors passed here? How is it in TF?
+        // https://stackoverflow.com/questions/77400589/what-is-the-reason-for-multiheadattention-having-a-different-call-convention-tha
         // todo: implement
         return input;
     }
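
For context on the question in the added comment: tf.keras.layers.MultiHeadAttention takes query, value, and (optionally) key as separate call arguments rather than as a single list of input tensors, which is what the linked Stack Overflow thread is about. A minimal Python sketch of that call convention (the shapes and hyperparameters below are chosen only for illustration):

    import numpy as np
    import tensorflow as tf

    # Keras passes query/value/key as separate call arguments, not as one input list.
    mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=4)

    query = np.random.rand(1, 3, 8).astype(np.float32)  # (batch, target_seq_len, dim)
    value = np.random.rand(1, 5, 8).astype(np.float32)  # (batch, source_seq_len, dim)

    # When key is omitted, it defaults to value.
    output = mha(query, value)
    print(output.shape)  # (1, 3, 8) -- the output feature dim follows the query's last dim

This is only a sketch of the Keras-side convention; how frugally-deep should route the extra tensors into apply_impl (which currently receives a single-element input list) is exactly the open question the todo records.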