Skip to content

Commit

Permalink
Add OV GQA op and decomposition pass
Browse files Browse the repository at this point in the history
Fix interleave logic in decomposition

Add ONNX frontend tests
  • Loading branch information
wine99 committed Jan 26, 2025
1 parent d5ed312 commit 911691b
Show file tree
Hide file tree
Showing 15 changed files with 1,957 additions and 261 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/group_query_attention.hpp"
#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

class TRANSFORMATIONS_API GroupQueryAttentionDecomposition;

} // namespace pass
} // namespace ov

/// \brief Decomposes ov::op::v15::GroupQueryAttention into a subgraph of standard
/// OpenVINO operations so that plugins without a native GQA kernel can execute it.
class ov::pass::GroupQueryAttentionDecomposition : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("GroupQueryAttentionDecomposition", "0");
GroupQueryAttentionDecomposition();
// Builds the replacement subgraph for `node` and returns the outputs that
// substitute the original node's three outputs (attention output, K cache, V cache).
ov::OutputVector decompose(std::shared_ptr<ov::op::v15::GroupQueryAttention> node);
};
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@
#include "transformations/op_conversions/eye_decomposition.hpp"
#include "transformations/op_conversions/gelu7_downgrade.hpp"
#include "transformations/op_conversions/group_normalization_decomposition.hpp"
#include "transformations/op_conversions/group_query_attention_decomposition.hpp"
#include "transformations/op_conversions/hsigmoid_decomposition.hpp"
#include "transformations/op_conversions/hswish_decomposition.hpp"
#include "transformations/op_conversions/log_softmax_decomposition.hpp"
Expand Down Expand Up @@ -156,6 +157,7 @@ bool ov::pass::CommonOptimizations::run_on_model(const std::shared_ptr<ov::Model
REGISTER_DISABLED_PASS(manager, ConvertInterpolate1ToInterpolate4)

auto decomp = manager.register_pass<GraphRewrite>();
ADD_MATCHER(decomp, GroupQueryAttentionDecomposition)
ADD_MATCHER(decomp, ScaledDotProductAttentionDecomposition)
ADD_MATCHER(decomp, Gelu7Downgrade)
ADD_MATCHER(decomp, BidirectionalSequenceDecomposition)
Expand Down

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions src/core/include/openvino/op/group_query_attention.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace op {
namespace v15 {

// This is an experimental operation that is implemented in the plugins.
/// \brief Group Query Attention operation (mirrors the ONNX contrib GroupQueryAttention).
///
/// This is an experimental operation that is implemented in the plugins.
class OPENVINO_API GroupQueryAttention : public Op {
public:
    OPENVINO_OP("GroupQueryAttention", "opset15", op::Op);

    GroupQueryAttention() = default;
    /// \param args                Operation inputs (query/key/value, KV caches, etc.).
    /// \param num_heads           Number of attention heads for Q.
    /// \param kv_num_heads        Number of attention heads for K and V (grouping factor).
    /// \param scale               Attention score scaling; 0 means "use default".
    /// \param do_rotary           Whether rotary position embedding is applied.
    /// \param rotary_interleaved  Whether rotary embedding uses interleaved layout.
    GroupQueryAttention(const ov::OutputVector& args,
                        unsigned int num_heads,
                        unsigned int kv_num_heads,
                        float scale,
                        bool do_rotary,
                        bool rotary_interleaved);
    void validate_and_infer_types() override;
    bool visit_attributes(AttributeVisitor& visitor) override;
    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

    unsigned int get_num_heads() const {
        return m_num_heads;
    }
    unsigned int get_kv_num_heads() const {
        return m_kv_num_heads;
    }
    float get_scale() const {
        return m_scale;
    }
    bool get_do_rotary() const {
        return m_do_rotary;
    }
    bool get_rotary_interleaved() const {
        return m_rotary_interleaved;
    }

private:
    // In-class initializers: a default-constructed node (e.g. during deserialization)
    // must not carry indeterminate head counts.
    unsigned int m_num_heads = 0;
    unsigned int m_kv_num_heads = 0;
    float m_scale = 0;
    bool m_do_rotary = false;
    bool m_rotary_interleaved = false;
};

} // namespace v15
} // namespace op
} // namespace ov
47 changes: 47 additions & 0 deletions src/core/include/openvino/op/null.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace op {
namespace v15 {

/// \brief Represents a missing optional input or output of an ONNX node
///
/// Some ONNX operators have inputs or outputs that are marked as optional,
/// which means that a referring node MAY forgo providing values for such inputs
/// or computing these outputs.
/// An empty string is used in place of a name of such input or output.
///
/// More:
/// https://github.com/onnx/onnx/blob/master/docs/IR.md#optional-inputs-and-outputs
class OPENVINO_API Null : public Op {
public:
    OPENVINO_OP("Null", "opset15", op::Op);
    Null() {
        // A Null still exposes one (dummy) output so it can be wired as an input.
        set_output_size(1);
    }

    /// \brief Returns true when \p node is a v15::Null placeholder.
    static bool is_null(const ov::Node* node) {
        return ov::as_type<const ov::op::v15::Null>(node) != nullptr;
    }

    static bool is_null(const std::shared_ptr<ov::Node>& node) {
        return is_null(node.get());
    }

    static bool is_null(const Output<ov::Node>& output) {
        return is_null(output.get_node());
    }

    // `override` already implies virtual; the redundant `virtual` keyword is dropped.
    // Null carries no state, so the new arguments are intentionally ignored.
    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override {
        return std::make_shared<ov::op::v15::Null>();
    }
};
} // namespace v15
} // namespace op
} // namespace ov
2 changes: 2 additions & 0 deletions src/core/include/openvino/op/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@
#include "openvino/op/roll.hpp"
#include "openvino/op/round.hpp"
#include "openvino/op/scaled_dot_product_attention.hpp"
#include "openvino/op/null.hpp"
#include "openvino/op/group_query_attention.hpp"
#include "openvino/op/scatter_elements_update.hpp"
#include "openvino/op/scatter_nd_update.hpp"
#include "openvino/op/scatter_update.hpp"
Expand Down
2 changes: 2 additions & 0 deletions src/core/include/openvino/opsets/opset15_tbl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,5 @@ _OPENVINO_OP_REG(BitwiseLeftShift, ov::op::v15)
_OPENVINO_OP_REG(BitwiseRightShift, ov::op::v15)
_OPENVINO_OP_REG(SliceScatter, ov::op::v15)
_OPENVINO_OP_REG(SearchSorted, ov::op::v15)
_OPENVINO_OP_REG(GroupQueryAttention, ov::op::v15)
_OPENVINO_OP_REG(Null, ov::op::v15)
92 changes: 92 additions & 0 deletions src/core/src/op/group_query_attention.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/op/group_query_attention.hpp"

#include "itt.hpp"
#include "openvino/op/null.hpp"

using namespace std;
namespace ov {
namespace op {
namespace v15 {

// Constructs the node from its inputs and attributes and runs shape/type inference.
// `scale` of 0 is treated as "use the default scaling" downstream.
GroupQueryAttention::GroupQueryAttention(const OutputVector& args,
unsigned int num_heads,
unsigned int kv_num_heads,
float scale,
bool do_rotary,
bool rotary_interleaved)
: Op(args),
m_num_heads(num_heads),
m_kv_num_heads(kv_num_heads),
m_scale(scale),
m_do_rotary(do_rotary),
m_rotary_interleaved(rotary_interleaved) {
constructor_validate_and_infer_types();
}

// Per-head size when input 0 packs Q, K and V along the hidden dimension:
// hidden = (num_heads + 2 * kv_num_heads) * head_size.
int64_t get_head_size(const PartialShape& input_shape, int num_heads, int kv_num_heads) {
    const int64_t hidden_size = input_shape[2].get_length();
    const int64_t total_heads = num_heads + 2 * kv_num_heads;
    return hidden_size / total_heads;
}

std::vector<int64_t> get_qkv_sizes(const PartialShape& input_shape, int num_heads, int kv_num_heads) {
int64_t per_head_size = get_head_size(input_shape, num_heads, kv_num_heads);
const std::vector<int64_t> qkv_sizes = {num_heads * per_head_size,
kv_num_heads * per_head_size,
kv_num_heads * per_head_size};
return qkv_sizes;
}

void GroupQueryAttention::validate_and_infer_types() {
    OV_OP_SCOPE(v15_GroupQueryAttention_validate_and_infer_types);
    PartialShape input_shape = get_input_partial_shape(0);
    Dimension batch_size = input_shape[0];
    Dimension sequence_len = input_shape[1];
    Dimension head_size;
    // When inputs 1 and 2 (K and V) are Null placeholders, input 0 packs Q, K
    // and V along the hidden dimension; otherwise input 0 holds only Q.
    if (Null::is_null(input_value(1)) && Null::is_null(input_value(2))) {
        head_size = get_head_size(input_shape, m_num_heads, m_kv_num_heads);
    } else {
        head_size = input_shape[2].get_length() / m_num_heads;
    }
    Dimension output_kv_len;
    PartialShape kv_past_shape = get_input_partial_shape(3);
    // FIXME: https://github.com/openvinotoolkit/openvino/pull/27648
    if (kv_past_shape[2].is_static()) {
        output_kv_len = kv_past_shape[2] + sequence_len;
    } else {
        output_kv_len = ov::Dimension();
    }
    auto element_type = get_input_element_type(0);
    NODE_VALIDATION_CHECK(this,
                          element_type == element::f32 || element_type == element::f16,
                          "GroupQueryAttention only supports f32 and f16");
    // Output 0: attention result [batch, seq, hidden]; outputs 1/2: updated K/V
    // caches [batch, kv_heads, past+seq, head_size].
    set_output_type(0, element_type, PartialShape{batch_size, sequence_len, head_size * m_num_heads});
    set_output_type(1, element_type, PartialShape{batch_size, m_kv_num_heads, output_kv_len, head_size});
    set_output_type(2, element_type, PartialShape{batch_size, m_kv_num_heads, output_kv_len, head_size});
}

// Serializes/deserializes the node's attributes. NOTE(review): visitation order
// is kept as-is — presumably serialized output depends on it; confirm before reordering.
bool GroupQueryAttention::visit_attributes(AttributeVisitor& visitor) {
OV_OP_SCOPE(v15_GroupQueryAttention_visit_attributes);
visitor.on_attribute("do_rotary", m_do_rotary);
visitor.on_attribute("kv_num_heads", m_kv_num_heads);
visitor.on_attribute("num_heads", m_num_heads);
visitor.on_attribute("rotary_interleaved", m_rotary_interleaved);
visitor.on_attribute("scale", m_scale);
return true;
}

// Creates a copy of this node wired to `new_args`, preserving all attributes.
std::shared_ptr<ov::Node> GroupQueryAttention::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    OV_OP_SCOPE(v15_GroupQueryAttention_clone_with_new_inputs);
    // Standard OpenVINO clone guard: reject an argument count different from the original.
    check_new_args_count(this, new_args);
    return std::make_shared<GroupQueryAttention>(new_args,
                                                 m_num_heads,
                                                 m_kv_num_heads,
                                                 m_scale,
                                                 m_do_rotary,
                                                 m_rotary_interleaved);
}

} // namespace v15
} // namespace op
} // namespace ov
2 changes: 1 addition & 1 deletion src/core/tests/opset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ INSTANTIATE_TEST_SUITE_P(opset,
OpsetTestParams{ov::get_opset12, 178},
OpsetTestParams{ov::get_opset13, 186},
OpsetTestParams{ov::get_opset14, 188},
OpsetTestParams{ov::get_opset15, 199},
OpsetTestParams{ov::get_opset15, 201},
OpsetTestParams{ov::get_opset16, 4}),
OpsetTestNameGenerator{});

Expand Down
Loading

0 comments on commit 911691b

Please sign in to comment.