Skip to content

Commit 68a97d3

Browse files
committed
[CPU] Avoid unnecessary reordering for nodes with several consumers before output & updated FC bf16 config
1 parent 174869c commit 68a97d3

File tree

10 files changed

+91
-125
lines changed

10 files changed

+91
-125
lines changed

src/plugins/intel_cpu/src/graph.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,17 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
244244
const auto parentEdge = outputNode->getParentEdgeAt(0);
245245
const auto parent = parentEdge->getParent();
246246
parent->setOriginalOutputPrecisionAtPort(parentEdge->getInputNum(), precToSet);
247+
// If the parent has consumers other than Output, precToSet is also propagated to those consumers' inputs
248+
// to avoid a precision mismatch (which leads to Reorder insertion and unnecessary performance overhead)
249+
if (parent->getChildEdges().size() > 1) {
250+
for (size_t i = 0; i < parent->getChildEdges().size(); ++i) {
251+
const auto childEdge = parent->getChildEdgeAt(i);
252+
// Consumers connected to the parent's other output ports must not be changed
253+
if (childEdge->getInputNum() != parentEdge->getInputNum())
254+
continue;
255+
childEdge->getChild()->setOriginalInputPrecisionAtPort(childEdge->getOutputNum(), precToSet);
256+
}
257+
}
247258
}
248259
}
249260
}

src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,12 @@ struct Require {
6262
// clang-format off
6363
static const TypeMapping dnnlFCTypeMapping {
6464
// {src, wei, bia, dst} pt<src, wei, bias, dst>
65-
{{_bf16, _bf16 | _f32, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())},
65+
{{_bf16, _bf16 | _f32 | _f16, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())},
6666
{{_f16, _f16, _any, _f16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())},
6767
// integer precision outputs are not supported for float precision inputs
6868
{{_f32 | _bf16 | _f16, _any, _any, _i8 | _u8}, pt(bypass(), bypass(), use<0>(), use<0>())},
6969
// compresses float weights which do not match input data precision
7070
{{_f32, _half_float, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())},
71-
{{_bf16, _f16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())},
7271
{{_f16, _bf16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())},
7372
// quantization configuration
7473
// int8 inner_product does not support f16 output and bias

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/convolution.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,7 @@ void ConvolutionLayerCPUTest::checkBiasFusing(ov::CompiledModel& execNet) const
7777
bool foundConv = false;
7878
for (const auto& node : execGraph->get_ops()) {
7979
const auto& rtInfo = node->get_rt_info();
80-
auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
81-
auto it = rtInfo.find(paramName);
82-
OPENVINO_ASSERT(rtInfo.end() != it);
83-
return it->second.as<std::string>();
84-
};
85-
86-
if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
80+
if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
8781
foundConv = true;
8882
ASSERT_EQ(3, node->inputs().size());
8983
break;

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,8 @@ class GroupConvolutionLayerCPUTest : public testing::WithParamInterface<groupCon
9696
bool foundConv = false;
9797
for (const auto& node : execGraph->get_ops()) {
9898
const auto& rtInfo = node->get_rt_info();
99-
auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
100-
auto it = rtInfo.find(paramName);
101-
OPENVINO_ASSERT(rtInfo.end() != it);
102-
return it->second.as<std::string>();
103-
};
10499

105-
if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
100+
if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
106101
foundConv = true;
107102
ASSERT_EQ(3, node->inputs().size());
108103
break;
@@ -228,13 +223,8 @@ TEST_P(ExpectFallbackGroupConvolutionLayerCPUTest, CompareWithRefs) {
228223
auto function = compiledModel.get_runtime_model();
229224
for (const auto& node : function->get_ops()) {
230225
const auto& rtInfo = node->get_rt_info();
231-
auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
232-
auto it = rtInfo.find(paramName);
233-
OPENVINO_ASSERT(rtInfo.end() != it);
234-
return it->second.as<std::string>();
235-
};
236-
if ("Convolution" == getExecValue(ov::exec_model_info::LAYER_TYPE)) {
237-
auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
226+
if ("Convolution" == getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE)) {
227+
auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
238228
ASSERT_TRUE(selectedType != primType) << "primType is unexpected: " << primType;
239229
}
240230
}

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/fuse_transpose_reorder.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,8 @@ void FuseTransposeAndReorderTest::check_transpose_count(size_t expectedTranspose
3131
ASSERT_NE(nullptr, runtime_model);
3232
size_t actual_transpose_count = 0;
3333
for (const auto &node : runtime_model->get_ops()) {
34-
const auto & rtInfo = node->get_rt_info();
35-
auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
36-
auto it = rtInfo.find(paramName);
37-
OPENVINO_ASSERT(rtInfo.end() != it);
38-
return it->second.as<std::string>();
39-
};
40-
if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Transpose") {
34+
const auto& rtInfo = node->get_rt_info();
35+
if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Transpose") {
4136
actual_transpose_count++;
4237
}
4338
}

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp

Lines changed: 32 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -100,24 +100,11 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
100100
std::tie(inputShapes, transpose, weiElemType, additionalConfig, cpuParams) = obj.param;
101101

102102
std::ostringstream result;
103-
for (const auto& shape : inputShapes) {
104-
result << ov::test::utils::partialShape2str({shape.first}) << "_";
105-
}
106-
result << "TS=";
107-
for (const auto& shape : inputShapes) {
108-
result << "(";
109-
if (!shape.second.empty()) {
110-
auto itr = shape.second.begin();
111-
do {
112-
result << ov::test::utils::vec2str(*itr);
113-
} while (++itr != shape.second.end() && result << "_");
114-
}
115-
result << ")_";
116-
}
103+
for (size_t i = 0; i < inputShapes.size(); ++i)
104+
result << "IS[" << i << "]=" << inputShapes[i] << "_";
117105
result << "transpose_a=" << transpose.first << "_";
118106
result << "transpose_b=" << transpose.second << "_";
119-
120-
result << "weiLemType=" << weiElemType << "_";
107+
result << "weiElemType=" << weiElemType << "_";
121108

122109
result << "config=(";
123110
for (const auto& configEntry : additionalConfig) {
@@ -137,25 +124,6 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
137124
std::swap(*(shape.end() - 1), *(shape.end() - 2));
138125
}
139126

140-
void check_fc_weights_precision(ElementType expectedWeiElemType) const {
141-
auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string& paramName) -> std::string {
142-
auto it = rtInfo.find(paramName);
143-
OPENVINO_ASSERT(rtInfo.end() != it);
144-
return it->second.as<std::string>();
145-
};
146-
147-
const auto execFunction = compiledModel.get_runtime_model();
148-
ASSERT_NE(nullptr, execFunction);
149-
for (const auto& fcNode : execFunction->get_ops()) {
150-
if (getExecValue(fcNode->get_rt_info(), ov::exec_model_info::LAYER_TYPE) == "FullyConnected") {
151-
const auto& constNode = fcNode->get_input_node_shared_ptr(1);
152-
ov::element::Type expectedType(
153-
getExecValue(constNode->get_rt_info(), ov::exec_model_info::OUTPUT_PRECISIONS));
154-
ASSERT_EQ(expectedType, expectedWeiElemType);
155-
}
156-
}
157-
}
158-
159127
void SetUp() override {
160128
targetDevice = ov::test::utils::DEVICE_CPU;
161129

@@ -196,23 +164,23 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
196164

197165
configuration.insert(additionalConfig.begin(), additionalConfig.end());
198166

199-
ElementType netType = ElementType::f32;
200-
ElementType convertOutType = ElementType::f32;
167+
inType = outType = netType = ElementType::f32;
201168
auto it = additionalConfig.find(ov::hint::inference_precision.name());
202169
if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
203-
convertOutType = inType = outType = netType = ElementType::bf16;
170+
netType = ElementType::bf16;
204171
weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
205-
} else {
206-
inType = outType = netType;
172+
// Reorder between parameter and FullyConnected
173+
// Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
174+
reorderCount++;
207175
}
208176

209177
std::string cpuNodeType = "FullyConnected";
210-
selectedType = makeSelectedTypeStr(selectedType, outType);
178+
selectedType = makeSelectedTypeStr(selectedType, netType);
211179

212180
ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(inType, inShapeA)};
213181
std::shared_ptr<ov::Node> inputB = ov::test::utils::make_constant(weiConstElemType, inShapeB.get_shape());
214-
if (weiConstElemType == ElementType::f16 || weiConstElemType == ElementType::bf16) {
215-
inputB = std::make_shared<ov::op::v0::Convert>(inputB, convertOutType);
182+
if (weiConstElemType != inType) {
183+
inputB = std::make_shared<ov::op::v0::Convert>(inputB, inType);
216184
mark_as_decompression(inputB);
217185
}
218186
expectedWeiConstElemType = weiConstElemType;
@@ -223,18 +191,22 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
223191
}
224192

225193
virtual void check_execution_graph() {
194+
CheckNodePrecisionsWithType(compiledModel, "FullyConnected", {netType, expectedWeiConstElemType, ov::element::undefined}, {outType});
226195
CheckPluginRelatedResults(compiledModel, "FullyConnected");
227196
CheckNumberOfNodesWithType(compiledModel, "FullyConnected", fullyConnectedCount);
228197
CheckNumberOfNodesWithType(compiledModel, "Transpose", transposeCount);
229198
CheckNumberOfNodesWithType(compiledModel, "Convert", convertCount);
230-
CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
231-
check_fc_weights_precision(expectedWeiConstElemType);
199+
// Note: Convert node might be converted to Subgraph
200+
CheckNumberOfNodesWithType(compiledModel, "Subgraph", 0);
201+
CheckNumberOfNodesWithType(compiledModel, "Reorder", reorderCount);
232202
}
233203

234204
size_t fullyConnectedCount = 1;
235205
size_t transposeCount = 0;
236206
size_t convertCount = 0;
207+
size_t reorderCount = 0;
237208
ElementType expectedWeiConstElemType = ElementType::f32;
209+
ElementType netType = ElementType::f32;
238210
};
239211

240212
TEST_P(MatMulDecompressConvertTest, CompareWithRefs) {
@@ -266,11 +238,9 @@ const std::vector<std::vector<InputShape>> inputShapes3D = {
266238
{{{-1, -1, -1}, {{1, 2, 3}, {1, 5, 3}}}, {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}},
267239
};
268240

269-
ov::AnyMap emptyConfig = {/* empty config */};
270-
271241
std::vector<ov::AnyMap> filter_additional_config_bf16() {
272242
std::vector<ov::AnyMap> additionalConfig;
273-
if (ov::with_cpu_x86_avx512_core()) {
243+
if (ov::with_cpu_x86_bfloat16()) {
274244
additionalConfig.push_back({{ov::hint::inference_precision(ov::element::bf16)}});
275245
}
276246
return additionalConfig;
@@ -307,7 +277,7 @@ std::vector<CPUSpecificParams> filter_specific_params_bf16() {
307277
const auto testParams2D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
308278
::testing::ValuesIn(transposeParams),
309279
::testing::Values(ElementType::f32),
310-
::testing::Values(emptyConfig),
280+
::testing::Values(CPUTestUtils::empty_plugin_config),
311281
::testing::ValuesIn(filter_specific_params(true)));
312282

313283
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
@@ -318,7 +288,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
318288
const auto testParams2D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
319289
::testing::ValuesIn(transposeParams),
320290
::testing::Values(ElementType::f16, ElementType::bf16),
321-
::testing::Values(emptyConfig),
291+
::testing::Values(CPUTestUtils::empty_plugin_config),
322292
::testing::ValuesIn(filter_specific_params(false)));
323293

324294
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D,
@@ -340,7 +310,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16,
340310
const auto testParams3D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
341311
::testing::ValuesIn(transposeParams),
342312
::testing::Values(ElementType::f32),
343-
::testing::Values(emptyConfig),
313+
::testing::Values(CPUTestUtils::empty_plugin_config),
344314
::testing::ValuesIn(filter_specific_params(true)));
345315

346316
INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
@@ -351,7 +321,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
351321
const auto testParams3D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
352322
::testing::ValuesIn(transposeParams),
353323
::testing::Values(ElementType::f16, ElementType::bf16),
354-
::testing::Values(emptyConfig),
324+
::testing::Values(CPUTestUtils::empty_plugin_config),
355325
::testing::ValuesIn(filter_specific_params(false)));
356326

357327
INSTANTIATE_TEST_SUITE_P(smoke_FC_3D,
@@ -460,26 +430,26 @@ class MatMulDecompressConvertTest2 : public MatMulDecompressConvertTest {
460430

461431
configuration.insert(additionalConfig.begin(), additionalConfig.end());
462432

463-
ElementType netType = ElementType::f32;
464-
ElementType convertOutType = ElementType::f32;
433+
inType = outType = netType = ElementType::f32;
465434
auto it = additionalConfig.find(ov::hint::inference_precision.name());
466435
if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
467-
convertOutType = inType = outType = netType = ElementType::bf16;
436+
netType = ElementType::bf16;
468437
weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
469-
} else {
470-
inType = outType = netType;
438+
// Reorder between parameter and FullyConnected
439+
// Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
440+
reorderCount++;
471441
}
472442

473443
std::string cpuNodeType = "FullyConnected";
474-
selectedType = makeSelectedTypeStr(selectedType, outType);
444+
selectedType = makeSelectedTypeStr(selectedType, netType);
475445

476446
ov::ParameterVector params;
477447
for (auto&& shape : {inShapeFC0, inShapeFC1}) {
478448
params.push_back(std::make_shared<ov::op::v0::Parameter>(inType, shape));
479449
}
480450
std::shared_ptr<ov::Node> inputWeights = ov::test::utils::make_constant(weiConstElemType, inShapeWeights.get_shape());
481-
if (weiConstElemType == ElementType::f16) {
482-
inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, convertOutType);
451+
if (weiConstElemType != inType) {
452+
inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, inType);
483453
mark_as_decompression(inputWeights);
484454
}
485455
expectedWeiConstElemType = weiConstElemType;
@@ -505,7 +475,7 @@ const auto testParams2D_FP16_2_smoke =
505475
::testing::Combine(::testing::Values(static_shapes_to_test_representation({{2, 3}, {2, 3}, {3, 4}})),
506476
::testing::Values(std::pair<bool, bool>{false, true}),
507477
::testing::Values(ElementType::f16),
508-
::testing::Values(emptyConfig),
478+
::testing::Values(CPUTestUtils::empty_plugin_config),
509479
::testing::ValuesIn(filter_specific_params(false)));
510480

511481
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_2,
@@ -644,7 +614,7 @@ const auto testParams2D_FP16_3_smoke =
644614
::testing::Combine(::testing::Values(static_shapes_to_test_representation({{1, 16, 32}, {32, 64}})),
645615
::testing::Values(std::pair<bool, bool>{false, false}),
646616
::testing::Values(ElementType::f16),
647-
::testing::Values(emptyConfig),
617+
::testing::Values(CPUTestUtils::empty_plugin_config),
648618
::testing::ValuesIn(filter_specific_params(false)));
649619

650620
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_3,

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_quantized_subgraph.cpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -108,14 +108,9 @@ class MatmulBrgemmInt8Test : public testing::WithParamInterface<MatmulBrgemmInt8
108108
void check_node(std::shared_ptr<const ov::Model> function, const std::string& nodeName) {
109109
ASSERT_NE(nullptr, function);
110110
for (const auto &node : function->get_ops()) {
111-
const auto & rtInfo = node->get_rt_info();
112-
auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
113-
auto it = rtInfo.find(paramName);
114-
OPENVINO_ASSERT(rtInfo.end() != it);
115-
return it->second.as<std::string>();
116-
};
111+
const auto& rtInfo = node->get_rt_info();
117112
if (node->get_friendly_name() == nodeName) {
118-
auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
113+
auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
119114
ASSERT_TRUE(primTypeCheck(primType)) << "primType is unexpected: " << primType << " Expected: " << selectedType;
120115
ASSERT_EQ(node->get_output_element_type(0), outType);
121116
ASSERT_EQ(node->get_input_element_type(0), inType);

0 commit comments

Comments
 (0)