[CPU] Avoid unnecessary reordering for nodes with several consumers before output & update FC bf16 config

v-Golubev · v-Golubev · commit 68a97d3ce13c · 2025-01-24T13:54:12.000+01:00
diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
@@ -244,6 +244,17 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
             const auto parentEdge = outputNode->getParentEdgeAt(0);
             const auto parent = parentEdge->getParent();
             parent->setOriginalOutputPrecisionAtPort(parentEdge->getInputNum(), precToSet);
+            // If the parent has consumers other than Output, precToSet is also propagated to those consumers' inputs
+            // to avoid a precision mismatch (which would lead to reorder insertion and unnecessary performance overhead)
+            if (parent->getChildEdges().size() > 1) {
+                for (size_t i = 0; i < parent->getChildEdges().size(); ++i) {
+                    const auto childEdge = parent->getChildEdgeAt(i);
+                    // Consumers connected to other output ports of the parent must not be changed
+                    if (childEdge->getInputNum() != parentEdge->getInputNum())
+                        continue;
+                    childEdge->getChild()->setOriginalInputPrecisionAtPort(childEdge->getOutputNum(), precToSet);
+                }
+            }
         }
     }
 }
diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp
@@ -62,13 +62,12 @@ struct Require {
 // clang-format off
 static const TypeMapping dnnlFCTypeMapping {
     // {src, wei, bia, dst}                                   pt<src, wei, bias, dst>
-    {{_bf16, _bf16 | _f32, _any, _bf16 | _f32},               pt(bypass(), bypass(), use<3>(), bypass())},
+    {{_bf16, _bf16 | _f32 | _f16, _any, _bf16 | _f32},        pt(bypass(), bypass(), use<3>(), bypass())},
     {{_f16, _f16, _any, _f16 | _f32},                         pt(bypass(), bypass(), use<3>(), bypass())},
     // integer precision outputs are not supported for float precision inputs
     {{_f32 | _bf16 | _f16, _any, _any, _i8 | _u8},            pt(bypass(), bypass(), use<0>(), use<0>())},
     // compresses float weights which do not match input data precision
     {{_f32, _half_float, _any, _any | _any},                  pt(bypass(), bypass(), use<0>(), use<0>())},
-    {{_bf16, _f16, _any, _any | _any},                        pt(bypass(), bypass(), use<0>(), use<0>())},
     {{_f16, _bf16, _any, _any | _any},                        pt(bypass(), bypass(), use<0>(), use<0>())},
     // quantization configuration
     // int8 inner_product does not support f16 output and bias
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/convolution.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/convolution.cpp
@@ -77,13 +77,7 @@ void ConvolutionLayerCPUTest::checkBiasFusing(ov::CompiledModel& execNet) const
     bool foundConv = false;
     for (const auto& node : execGraph->get_ops()) {
         const auto& rtInfo = node->get_rt_info();
-        auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-
-        if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
+        if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
             foundConv = true;
             ASSERT_EQ(3, node->inputs().size());
             break;
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp
@@ -96,13 +96,8 @@ class GroupConvolutionLayerCPUTest : public testing::WithParamInterface<groupCon
         bool foundConv = false;
         for (const auto& node : execGraph->get_ops()) {
             const auto& rtInfo = node->get_rt_info();
-            auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
-                auto it = rtInfo.find(paramName);
-                OPENVINO_ASSERT(rtInfo.end() != it);
-                return it->second.as<std::string>();
-            };
 
-            if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
+            if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
                 foundConv = true;
                 ASSERT_EQ(3, node->inputs().size());
                 break;
@@ -228,13 +223,8 @@ TEST_P(ExpectFallbackGroupConvolutionLayerCPUTest, CompareWithRefs) {
     auto function = compiledModel.get_runtime_model();
     for (const auto& node : function->get_ops()) {
         const auto& rtInfo = node->get_rt_info();
-        auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-        if ("Convolution" == getExecValue(ov::exec_model_info::LAYER_TYPE)) {
-            auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
+        if ("Convolution" == getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE)) {
+            auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
             ASSERT_TRUE(selectedType != primType) << "primType is unexpected: " << primType;
         }
     }
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/fuse_transpose_reorder.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/fuse_transpose_reorder.cpp
@@ -31,13 +31,8 @@ void FuseTransposeAndReorderTest::check_transpose_count(size_t expectedTranspose
     ASSERT_NE(nullptr, runtime_model);
     size_t actual_transpose_count = 0;
     for (const auto &node : runtime_model->get_ops()) {
-        const auto & rtInfo = node->get_rt_info();
-        auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-        if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Transpose") {
+        const auto& rtInfo = node->get_rt_info();
+        if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Transpose") {
             actual_transpose_count++;
         }
     }
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp
@@ -100,24 +100,11 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
         std::tie(inputShapes, transpose, weiElemType, additionalConfig, cpuParams) = obj.param;
 
         std::ostringstream result;
-        for (const auto& shape : inputShapes) {
-            result << ov::test::utils::partialShape2str({shape.first}) << "_";
-        }
-        result << "TS=";
-        for (const auto& shape : inputShapes) {
-            result << "(";
-            if (!shape.second.empty()) {
-                auto itr = shape.second.begin();
-                do {
-                    result << ov::test::utils::vec2str(*itr);
-                } while (++itr != shape.second.end() && result << "_");
-            }
-            result << ")_";
-        }
+        for (size_t i = 0; i < inputShapes.size(); ++i)
+            result << "IS[" << i << "]=" << inputShapes[i] << "_";
         result << "transpose_a=" << transpose.first << "_";
         result << "transpose_b=" << transpose.second << "_";
-
-        result << "weiLemType=" << weiElemType << "_";
+        result << "weiElemType=" << weiElemType << "_";
 
         result << "config=(";
         for (const auto& configEntry : additionalConfig) {
@@ -137,25 +124,6 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
         std::swap(*(shape.end() - 1), *(shape.end() - 2));
     }
 
-    void check_fc_weights_precision(ElementType expectedWeiElemType) const {
-        auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string& paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-
-        const auto execFunction = compiledModel.get_runtime_model();
-        ASSERT_NE(nullptr, execFunction);
-        for (const auto& fcNode : execFunction->get_ops()) {
-            if (getExecValue(fcNode->get_rt_info(), ov::exec_model_info::LAYER_TYPE) == "FullyConnected") {
-                const auto& constNode = fcNode->get_input_node_shared_ptr(1);
-                ov::element::Type expectedType(
-                    getExecValue(constNode->get_rt_info(), ov::exec_model_info::OUTPUT_PRECISIONS));
-                ASSERT_EQ(expectedType, expectedWeiElemType);
-            }
-        }
-    }
-
     void SetUp() override {
         targetDevice = ov::test::utils::DEVICE_CPU;
 
@@ -196,23 +164,23 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
 
         configuration.insert(additionalConfig.begin(), additionalConfig.end());
 
-        ElementType netType = ElementType::f32;
-        ElementType convertOutType = ElementType::f32;
+        inType = outType = netType = ElementType::f32;
         auto it = additionalConfig.find(ov::hint::inference_precision.name());
         if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
-            convertOutType = inType = outType = netType = ElementType::bf16;
+            netType = ElementType::bf16;
             weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
-        } else {
-            inType = outType = netType;
+            // Reorder between parameter and FullyConnected
+            // Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
+            reorderCount++;
         }
 
         std::string cpuNodeType = "FullyConnected";
-        selectedType = makeSelectedTypeStr(selectedType, outType);
+        selectedType = makeSelectedTypeStr(selectedType, netType);
 
         ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(inType, inShapeA)};
         std::shared_ptr<ov::Node> inputB = ov::test::utils::make_constant(weiConstElemType, inShapeB.get_shape());
-        if (weiConstElemType == ElementType::f16 || weiConstElemType == ElementType::bf16) {
-            inputB = std::make_shared<ov::op::v0::Convert>(inputB, convertOutType);
+        if (weiConstElemType != inType) {
+            inputB = std::make_shared<ov::op::v0::Convert>(inputB, inType);
             mark_as_decompression(inputB);
         }
         expectedWeiConstElemType = weiConstElemType;
@@ -223,18 +191,22 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
     }
 
     virtual void check_execution_graph() {
+        CheckNodePrecisionsWithType(compiledModel, "FullyConnected", {netType, expectedWeiConstElemType, ov::element::undefined}, {outType});
         CheckPluginRelatedResults(compiledModel, "FullyConnected");
         CheckNumberOfNodesWithType(compiledModel, "FullyConnected", fullyConnectedCount);
         CheckNumberOfNodesWithType(compiledModel, "Transpose", transposeCount);
         CheckNumberOfNodesWithType(compiledModel, "Convert", convertCount);
-        CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
-        check_fc_weights_precision(expectedWeiConstElemType);
+        // Note: Convert node might be converted to Subgraph
+        CheckNumberOfNodesWithType(compiledModel, "Subgraph", 0);
+        CheckNumberOfNodesWithType(compiledModel, "Reorder", reorderCount);
     }
 
     size_t fullyConnectedCount = 1;
     size_t transposeCount = 0;
     size_t convertCount = 0;
+    size_t reorderCount = 0;
     ElementType expectedWeiConstElemType = ElementType::f32;
+    ElementType netType = ElementType::f32;
 };
 
 TEST_P(MatMulDecompressConvertTest, CompareWithRefs) {
@@ -266,11 +238,9 @@ const std::vector<std::vector<InputShape>> inputShapes3D = {
     {{{-1, -1, -1}, {{1, 2, 3}, {1, 5, 3}}}, {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}},
 };
 
-ov::AnyMap emptyConfig = {/* empty config */};
-
 std::vector<ov::AnyMap> filter_additional_config_bf16() {
     std::vector<ov::AnyMap> additionalConfig;
-    if (ov::with_cpu_x86_avx512_core()) {
+    if (ov::with_cpu_x86_bfloat16()) {
         additionalConfig.push_back({{ov::hint::inference_precision(ov::element::bf16)}});
     }
     return additionalConfig;
@@ -307,7 +277,7 @@ std::vector<CPUSpecificParams> filter_specific_params_bf16() {
 const auto testParams2D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
                                                         ::testing::ValuesIn(transposeParams),
                                                         ::testing::Values(ElementType::f32),
-                                                        ::testing::Values(emptyConfig),
+                                                        ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                         ::testing::ValuesIn(filter_specific_params(true)));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
@@ -318,7 +288,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
 const auto testParams2D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
                                                    ::testing::ValuesIn(transposeParams),
                                                    ::testing::Values(ElementType::f16, ElementType::bf16),
-                                                   ::testing::Values(emptyConfig),
+                                                   ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                    ::testing::ValuesIn(filter_specific_params(false)));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D,
@@ -340,7 +310,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16,
 const auto testParams3D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
                                                         ::testing::ValuesIn(transposeParams),
                                                         ::testing::Values(ElementType::f32),
-                                                        ::testing::Values(emptyConfig),
+                                                        ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                         ::testing::ValuesIn(filter_specific_params(true)));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
@@ -351,7 +321,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
 const auto testParams3D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
                                                         ::testing::ValuesIn(transposeParams),
                                                         ::testing::Values(ElementType::f16, ElementType::bf16),
-                                                        ::testing::Values(emptyConfig),
+                                                        ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                         ::testing::ValuesIn(filter_specific_params(false)));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_3D,
@@ -460,26 +430,26 @@ class MatMulDecompressConvertTest2 : public MatMulDecompressConvertTest {
 
         configuration.insert(additionalConfig.begin(), additionalConfig.end());
 
-        ElementType netType = ElementType::f32;
-        ElementType convertOutType = ElementType::f32;
+        inType = outType = netType = ElementType::f32;
         auto it = additionalConfig.find(ov::hint::inference_precision.name());
         if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
-            convertOutType = inType = outType = netType = ElementType::bf16;
+            netType = ElementType::bf16;
             weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
-        } else {
-            inType = outType = netType;
+            // Reorder between parameter and FullyConnected
+            // Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
+            reorderCount++;
         }
 
         std::string cpuNodeType = "FullyConnected";
-        selectedType = makeSelectedTypeStr(selectedType, outType);
+        selectedType = makeSelectedTypeStr(selectedType, netType);
 
         ov::ParameterVector params;
         for (auto&& shape : {inShapeFC0, inShapeFC1}) {
             params.push_back(std::make_shared<ov::op::v0::Parameter>(inType, shape));
         }
         std::shared_ptr<ov::Node> inputWeights = ov::test::utils::make_constant(weiConstElemType, inShapeWeights.get_shape());
-        if (weiConstElemType == ElementType::f16) {
-            inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, convertOutType);
+        if (weiConstElemType != inType) {
+            inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, inType);
             mark_as_decompression(inputWeights);
         }
         expectedWeiConstElemType = weiConstElemType;
@@ -505,7 +475,7 @@ const auto testParams2D_FP16_2_smoke =
     ::testing::Combine(::testing::Values(static_shapes_to_test_representation({{2, 3}, {2, 3}, {3, 4}})),
                        ::testing::Values(std::pair<bool, bool>{false, true}),
                        ::testing::Values(ElementType::f16),
-                       ::testing::Values(emptyConfig),
+                       ::testing::Values(CPUTestUtils::empty_plugin_config),
                        ::testing::ValuesIn(filter_specific_params(false)));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_2,
@@ -644,7 +614,7 @@ const auto testParams2D_FP16_3_smoke =
     ::testing::Combine(::testing::Values(static_shapes_to_test_representation({{1, 16, 32}, {32, 64}})),
                        ::testing::Values(std::pair<bool, bool>{false, false}),
                        ::testing::Values(ElementType::f16),
-                       ::testing::Values(emptyConfig),
+                       ::testing::Values(CPUTestUtils::empty_plugin_config),
                        ::testing::ValuesIn(filter_specific_params(false)));
 
 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_3,
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_quantized_subgraph.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_quantized_subgraph.cpp
@@ -108,14 +108,9 @@ class MatmulBrgemmInt8Test : public testing::WithParamInterface<MatmulBrgemmInt8
     void check_node(std::shared_ptr<const ov::Model> function, const std::string& nodeName) {
         ASSERT_NE(nullptr, function);
         for (const auto &node : function->get_ops()) {
-            const auto & rtInfo = node->get_rt_info();
-            auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
-                auto it = rtInfo.find(paramName);
-                OPENVINO_ASSERT(rtInfo.end() != it);
-                return it->second.as<std::string>();
-            };
+            const auto& rtInfo = node->get_rt_info();
             if (node->get_friendly_name() == nodeName) {
-                auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
+                auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
                 ASSERT_TRUE(primTypeCheck(primType)) << "primType is unexpected: " << primType << " Expected: " << selectedType;
                 ASSERT_EQ(node->get_output_element_type(0), outType);
                 ASSERT_EQ(node->get_input_element_type(0), inType);
diff --git a/src/plugins/intel_cpu/tests/functional/utils/cpu_test_utils.cpp b/src/plugins/intel_cpu/tests/functional/utils/cpu_test_utils.cpp
diff --git a/src/plugins/intel_cpu/tests/functional/utils/cpu_test_utils.hpp b/src/plugins/intel_cpu/tests/functional/utils/cpu_test_utils.hpp
diff --git a/src/plugins/intel_cpu/tests/functional/utils/fusing_test_utils.cpp b/src/plugins/intel_cpu/tests/functional/utils/fusing_test_utils.cpp

Original file line number	Diff line number	Diff line change
`@@ -244,6 +244,17 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,`
`244`	`244`	`const auto parentEdge = outputNode->getParentEdgeAt(0);`
`245`	`245`	`const auto parent = parentEdge->getParent();`
`246`	`246`	`parent->setOriginalOutputPrecisionAtPort(parentEdge->getInputNum(), precToSet);`
	`247`	`+ // If the parent has consumers other than Output, precToSet is also propagated to those consumers' inputs`
	`248`	`+ // to avoid a precision mismatch (which would lead to reorder insertion and unnecessary performance overhead)`
	`249`	`+ if (parent->getChildEdges().size() > 1) {`
	`250`	`+ for (size_t i = 0; i < parent->getChildEdges().size(); ++i) {`
	`251`	`+ const auto childEdge = parent->getChildEdgeAt(i);`
	`252`	`+ // Consumers connected to other output ports of the parent must not be changed`
	`253`	`+ if (childEdge->getInputNum() != parentEdge->getInputNum())`
	`254`	`+ continue;`
	`255`	`+ childEdge->getChild()->setOriginalInputPrecisionAtPort(childEdge->getOutputNum(), precToSet);`
	`256`	`+ }`
	`257`	`+ }`
`247`	`258`	`}`
`248`	`259`	`}`
`249`	`260`	`}`