diff --git a/include/fdeep/layers/multi_head_attention_layer.hpp b/include/fdeep/layers/multi_head_attention_layer.hpp
index c14439b7..664401bf 100644
--- a/include/fdeep/layers/multi_head_attention_layer.hpp
+++ b/include/fdeep/layers/multi_head_attention_layer.hpp
@@ -30,6 +30,7 @@ class multi_head_attention_layer : public layer
     tensors apply_impl(const tensors& input) const override
     {
         // input.size() is 1. How shall the other tensors passed here? How is it in TF?
+        // https://stackoverflow.com/questions/77400589/what-is-the-reason-for-multiheadattention-having-a-different-call-convention-tha
         // todo: implement
         return input;
     }
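
For context on the question in the added comment: tf.keras.layers.MultiHeadAttention takes query, value, and (optionally) key as separate call arguments rather than as a single list of input tensors, which is what the linked Stack Overflow thread is about. A minimal Python sketch of that call convention (the shapes and hyperparameters below are chosen only for illustration):

    import numpy as np
    import tensorflow as tf

    # Keras passes query/value/key as separate call arguments, not as one input list.
    mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=4)

    query = np.random.rand(1, 3, 8).astype(np.float32)  # (batch, target_seq_len, dim)
    value = np.random.rand(1, 5, 8).astype(np.float32)  # (batch, source_seq_len, dim)

    # When key is omitted, it defaults to value.
    output = mha(query, value)
    print(output.shape)  # (1, 3, 8) -- the output feature dim follows the query's last dim

This is only a sketch of the Keras-side convention; how frugally-deep should route the extra tensors into apply_impl (which currently receives a single-element input list) is exactly the open question the todo records.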