Add MultiHeadAttention (#3650)
marko-fabo-htec authored Jan 15, 2025
1 parent d86646f commit d5c9f7a
Showing 30 changed files with 1,523 additions and 0 deletions.
295 changes: 295 additions & 0 deletions src/onnx/parse_multi_head_attention.cpp
@@ -0,0 +1,295 @@
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/ranges.hpp>
#include <cmath>
#include <string>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

enum class qkv_format_t
{
q_k_v = 0,
q_k_v_cross = 1,
kv_packed = 2,
qkv_packed = 3
};

struct multi_head_attention_parameters
{
int64_t batch_size;
int64_t q_sequence_length;
int64_t kv_sequence_length;
int64_t hidden_size;
int64_t hidden_size_v;
int64_t head_size;
int64_t head_size_v;
qkv_format_t qkv_format;
};

struct parse_multi_head_attention : op_parser<parse_multi_head_attention>
{

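// Parses the MultiHeadAttention contrib operator (ONNX Runtime's com.microsoft domain).
// Only the 'query', 'key' and 'value' inputs are handled; bias, key padding mask and
// past key/value inputs are not supported.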
std::vector<op_desc> operators() const { return {{"MultiHeadAttention"}}; }

void unpack_qkv(const onnx_parser::node_info& info,
instruction_ref& query,
instruction_ref& key,
instruction_ref& value) const
{
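// Split the packed QKV tensor: move the packing axis (size 3) to the front,
// then slice out Q, K and V and squeeze away the leading axis.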
// (batch_size, q_sequence_length, num_heads, 3, head_size) ->
// (3, batch_size, q_sequence_length, num_heads, head_size)
auto qkv_packed =
info.add_instruction(make_op("transpose", {{"permutation", {3, 0, 1, 2, 4}}}), query);
query = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), qkv_packed);
query = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), query);
key = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {1}}, {"ends", {2}}}), qkv_packed);
key = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), key);
value = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {2}}, {"ends", {3}}}), qkv_packed);
value = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), value);
}

void unpack_kv(const onnx_parser::node_info& info,
instruction_ref& key,
instruction_ref& value) const
{
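// Same approach as unpack_qkv, but for a packed KV tensor whose packing axis has size 2.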
// (batch_size, kv_sequence_length, num_heads, 2, head_size) ->
// (2, batch_size, kv_sequence_length, num_heads, head_size)
auto kv_packed =
info.add_instruction(make_op("transpose", {{"permutation", {3, 0, 1, 2, 4}}}), key);
key = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {1}}}), kv_packed);
key = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), key);
value = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {1}}, {"ends", {2}}}), kv_packed);
value = info.add_instruction(make_op("squeeze", {{"axes", {0}}}), value);
}

void check_inputs(const std::vector<instruction_ref>& args,
const int64_t num_heads,
multi_head_attention_parameters& params) const
{
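// Validates the input shapes and deduces batch, sequence, hidden and head dimensions.
// Supported layouts: rank-5 packed QKV; rank-3 query with rank-5 packed KV;
// rank-3 query, key and value; rank-3 query with rank-4 (pre-transposed) cross-attention key/value.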
if(args.empty() or args.size() > 3)
MIGRAPHX_THROW("MultiHeadAttention: Wrong number of inputs. Only 'query', 'key' and "
"'value' inputs are supported.");

auto query_dim = args[0]->get_shape().ndim();
auto query_lens = args[0]->get_shape().lens();

params.batch_size = query_lens[0];
params.q_sequence_length = query_lens[1];

if(query_dim != 3 and query_dim != 5)
MIGRAPHX_THROW("MultiHeadAttention: Input 'query' rank needs to be 3 or 5, current: " +
std::to_string(query_dim));

if(query_dim == 5)
{
if(query_lens[2] != num_heads or query_lens[3] != 3)
MIGRAPHX_THROW("MultiHeadAttention: Input 'query' shape needs to be (batch_size, "
"q_sequence_length, num_heads, 3, head_size) for packed input.");

params.kv_sequence_length = query_lens[1];
params.head_size = query_lens[4];
params.head_size_v = query_lens[4];
params.hidden_size = num_heads * query_lens[4];
params.hidden_size_v = num_heads * query_lens[4];
params.qkv_format = qkv_format_t::qkv_packed;
}
else // query_dim == 3
{
if(args.size() < 2)
MIGRAPHX_THROW("MultiHeadAttention: Wrong number of inputs, 'key' is missing.");

params.hidden_size = query_lens[2];
params.head_size = query_lens[2] / num_heads;

auto key_dim = args[1]->get_shape().ndim();
auto key_lens = args[1]->get_shape().lens();

if(key_dim < 3 or key_dim > 5)
MIGRAPHX_THROW(
"MultiHeadAttention: Input 'key' rank needs to be 3, 4 or 5, current: " +
std::to_string(key_dim));

if(key_dim == 5)
{
if(key_lens[0] != params.batch_size or key_lens[2] != num_heads or
key_lens[3] != 2 or key_lens[4] != params.head_size)
MIGRAPHX_THROW("MultiHeadAttention: Input 'key' shape needs to be (batch_size, "
"kv_sequence_length, num_heads, 2, head_size)");

params.kv_sequence_length = key_lens[1];
params.hidden_size_v = params.hidden_size;
params.head_size_v = key_lens[4];
params.qkv_format = qkv_format_t::kv_packed;
}
else
{
if(args.size() < 3)
MIGRAPHX_THROW(
"MultiHeadAttention: Wrong number of inputs, 'value' is missing.");

auto value_dim = args[2]->get_shape().ndim();
auto value_lens = args[2]->get_shape().lens();

if(key_dim != value_dim)
MIGRAPHX_THROW(
"MultiHeadAttention: Input 'key' and 'value' rank needs to be equal.");

if(key_dim == 3)
{
if(key_lens[0] != params.batch_size or key_lens[2] != params.hidden_size)
MIGRAPHX_THROW("MultiHeadAttention: Input 'key' shape needs to be "
"(batch_size, kv_sequence_length, hidden_size)");

if(value_lens[0] != params.batch_size or value_lens[1] != key_lens[1])
MIGRAPHX_THROW("MultiHeadAttention: Input 'value' shape needs to be "
"(batch_size, kv_sequence_length, hidden_size_v)");

params.kv_sequence_length = key_lens[1];
params.hidden_size_v = value_lens[2];
params.head_size_v = value_lens[2] / num_heads;
params.qkv_format = qkv_format_t::q_k_v;
}
else // key_dim == 4
{
if(key_lens[0] != params.batch_size or key_lens[1] != num_heads or
key_lens[3] != params.head_size)
MIGRAPHX_THROW("MultiHeadAttention: Input 'key' shape needs to be "
"(batch_size, num_heads, kv_sequence_length, head_size)");

if(value_lens[0] != params.batch_size or value_lens[1] != num_heads or
value_lens[2] != key_lens[2])
MIGRAPHX_THROW("MultiHeadAttention: Input 'value' shape needs to be "
"(batch_size, num_heads, kv_sequence_length, head_size_v)");

params.kv_sequence_length = key_lens[2];
params.hidden_size_v = value_lens[3] * num_heads;
params.head_size_v = value_lens[3];
params.qkv_format = qkv_format_t::q_k_v_cross;
}
}
}
}

instruction_ref parse(const op_desc& /*opd*/,
const onnx_parser& parser,
const onnx_parser::node_info& info,
const std::vector<instruction_ref>& args) const
{
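// Parse flow: read the required num_heads attribute, validate the inputs,
// bring query/key/value into (batch_size, num_heads, sequence_length, head_size) layout,
// then emit scaled dot-product attention and reshape the result back to
// (batch_size, q_sequence_length, hidden_size_v).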
if(not contains(info.attributes, "num_heads"))
MIGRAPHX_THROW("MultiHeadAttention: num_heads attribute is required");

int64_t num_heads = parser.parse_value(info.attributes.at("num_heads")).at<int>();

multi_head_attention_parameters params;
check_inputs(args, num_heads, params);

auto query = args[0];
instruction_ref key;
instruction_ref value;

if(params.qkv_format == qkv_format_t::qkv_packed)
{
// Packed QKV: (batch_size, q_sequence_length, num_heads, 3, head_size)
unpack_qkv(info, query, key, value);
}
else
{
// Query: (batch_size, q_sequence_length, hidden_size)
std::vector<int64_t> q_dims{
params.batch_size, params.q_sequence_length, num_heads, params.head_size};
query = info.add_instruction(make_op("reshape", {{"dims", q_dims}}), query);

key = args[1];

if(params.qkv_format == qkv_format_t::kv_packed)
{
// Packed KV: (batch_size, kv_sequence_length, num_heads, 2, head_size)
unpack_kv(info, key, value);
}
else
{
value = args[2];
if(params.qkv_format == qkv_format_t::q_k_v)
{
// Key: (batch_size, kv_sequence_length, hidden_size)
// Value: (batch_size, kv_sequence_length, hidden_size_v)
std::vector<int64_t> k_dims{
params.batch_size, params.kv_sequence_length, num_heads, params.head_size};
std::vector<int64_t> v_dims{params.batch_size,
params.kv_sequence_length,
num_heads,
params.head_size_v};
key = info.add_instruction(make_op("reshape", {{"dims", k_dims}}), key);
value = info.add_instruction(make_op("reshape", {{"dims", v_dims}}), value);
}
}
}

// Target shape: (batch_size, num_heads, sequence_length, head_size)
std::vector<int64_t> perm{0, 2, 1, 3};
query = info.add_instruction(make_op("transpose", {{"permutation", perm}}), query);
if(params.qkv_format != qkv_format_t::q_k_v_cross)
{
key = info.add_instruction(make_op("transpose", {{"permutation", perm}}), key);
value = info.add_instruction(make_op("transpose", {{"permutation", perm}}), value);
}

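// The optional 'scale' attribute overrides the default scaling factor of 1/sqrt(head_size).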
float scale = 1 / std::sqrt(params.head_size);
if(contains(info.attributes, "scale"))
scale = parser.parse_value(info.attributes.at("scale")).at<float>();

auto scale_literal = info.add_literal(
migraphx::literal{migraphx::shape{query->get_shape().type()}, {scale}});

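// Scaled dot-product attention: softmax((Q x K^T) * scale) x V.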
auto key_transposed =
info.add_instruction(make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), key);

auto result = info.add_instruction(make_op("dot"), query, key_transposed);
result = info.add_common_op("mul", result, scale_literal);
result = info.add_instruction(make_op("softmax", {{"axis", -1}}), result);
result = info.add_instruction(make_op("dot"), result, value);
result = info.add_instruction(make_op("transpose", {{"permutation", perm}}), result);
result = info.add_instruction(
make_op(
"reshape",
{{"dims", {params.batch_size, params.q_sequence_length, params.hidden_size_v}}}),
result);

return result;
}
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx