Commit 481f876

[CPU] Avoid unnecessary reordering for nodes with several consumers before output & updated FC bf16 config
1 parent 69180e2 commit 481f876

File tree

10 files changed: +90 -124 lines changed

src/plugins/intel_cpu/src/graph.cpp

Lines changed: 11 additions & 0 deletions
@@ -223,6 +223,17 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model> &model) {
     const auto parentEdge = outputNode->getParentEdgeAt(0);
     const auto parent = parentEdge->getParent();
     parent->setOriginalOutputPrecisionAtPort(parentEdge->getInputNum(), precToSet);
+    // If the parent has consumers other than Output, precToSet is propagated to those consumers' inputs
+    // to avoid a precision mismatch (which leads to reorder insertion and unnecessary performance overhead)
+    if (parent->getChildEdges().size() > 1) {
+        for (size_t i = 0; i < parent->getChildEdges().size(); ++i) {
+            const auto childEdge = parent->getChildEdgeAt(i);
+            // Consumers from the parent's other outputs shouldn't be changed
+            if (childEdge->getInputNum() != parentEdge->getInputNum())
+                continue;
+            childEdge->getChild()->setOriginalInputPrecisionAtPort(childEdge->getOutputNum(), precToSet);
+        }
+    }
 }
 }
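
For readers skimming the diff, a minimal self-contained sketch of the rule this hunk adds, using simplified stand-in Node/Edge types (these are NOT the plugin's real classes): when the node feeding an Output has other consumers on the same output port, the forced precision is mirrored onto those consumers' inputs so the graph does not insert a Reorder between them.

#include <cstddef>
#include <vector>

// Hypothetical, simplified stand-ins for the plugin's Node/Edge classes.
enum class Precision { f32, bf16, f16 };

struct Node;

struct Edge {
    Node* parent = nullptr;
    Node* child = nullptr;
    int inputNum = 0;   // index of the parent's output port this edge leaves
    int outputNum = 0;  // index of the child's input port this edge enters
};

struct Node {
    std::vector<Edge*> childEdges;
    std::vector<Precision> outputPrecisions;  // per output port
    std::vector<Precision> inputPrecisions;   // per input port
};

// Pin `prec` on the parent port feeding the Output node, and mirror it on every
// sibling consumer of the same port so no Reorder is inserted between them.
void propagateOutputPrecision(const Edge& outputEdge, Precision prec) {
    Node* parent = outputEdge.parent;
    parent->outputPrecisions[outputEdge.inputNum] = prec;
    if (parent->childEdges.size() > 1) {
        for (Edge* childEdge : parent->childEdges) {
            // Consumers hanging off the parent's other output ports stay untouched.
            if (childEdge->inputNum != outputEdge.inputNum)
                continue;
            childEdge->child->inputPrecisions[childEdge->outputNum] = prec;
        }
    }
}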

src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp

Lines changed: 1 addition & 2 deletions
@@ -59,13 +59,12 @@ struct Require {
 // clang-format off
 static const TypeMapping dnnlFCTypeMapping {
     // {src, wei, bia, dst}                                    pt<src, wei, bias, dst>
-    {{_bf16, _bf16 | _f32, _any, _bf16 | _f32},                pt(bypass(), bypass(), use<3>(), bypass())},
+    {{_bf16, _bf16 | _f32 | _f16, _any, _bf16 | _f32},         pt(bypass(), bypass(), use<3>(), bypass())},
     {{_f16, _f16, _any, _f16 | _f32},                          pt(bypass(), bypass(), use<3>(), bypass())},
     // integer precision outputs are not supported for float precision inputs
     {{_f32 | _bf16 | _f16, _any, _any, _i8 | _u8},             pt(bypass(), bypass(), use<0>(), use<0>())},
     // compresses float weights which do not match input data precision
     {{_f32, _half_float, _any, _any | _any},                   pt(bypass(), bypass(), use<0>(), use<0>())},
-    {{_bf16, _f16, _any, _any | _any},                         pt(bypass(), bypass(), use<0>(), use<0>())},
     {{_f16, _bf16, _any, _any | _any},                         pt(bypass(), bypass(), use<0>(), use<0>())},
     // quantization configuration
     // int8 inner_product does not support f16 output and bias
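
The dropped {_bf16, _f16, ...} row becomes redundant because the first bf16 row now accepts f16 weights directly. A hedged sketch of how such a bitmask row can be matched; Row and matches are hypothetical names for illustration, not the plugin's actual TypeMapping machinery:

#include <cstdint>
#include <iostream>

// Illustrative only: each table field is a mask of accepted precisions.
constexpr uint32_t _f32 = 1u << 0, _bf16 = 1u << 1, _f16 = 1u << 2;

struct Row {
    uint32_t src, wei, bia, dst;
};

bool matches(const Row& r, uint32_t src, uint32_t wei, uint32_t bia, uint32_t dst) {
    return (r.src & src) && (r.wei & wei) && (r.bia & bia) && (r.dst & dst);
}

int main() {
    constexpr uint32_t _any = ~0u;
    // The updated first bf16 row: f16 weights are now accepted as-is,
    // so no dedicated fallback row (with weight repacking) is needed.
    Row bf16Row{_bf16, _bf16 | _f32 | _f16, _any, _bf16 | _f32};
    std::cout << matches(bf16Row, _bf16, _f16, _f32, _bf16) << '\n';  // prints 1
}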

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/convolution.cpp

Lines changed: 1 addition & 7 deletions
@@ -77,13 +77,7 @@ void ConvolutionLayerCPUTest::checkBiasFusing(ov::CompiledModel& execNet) const
     bool foundConv = false;
     for (const auto& node : execGraph->get_ops()) {
         const auto& rtInfo = node->get_rt_info();
-        auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-
-        if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
+        if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
             foundConv = true;
             ASSERT_EQ(3, node->inputs().size());
             break;
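
The getRuntimeValue helper these tests now call is not shown in this diff; presumably it centralizes the removed per-test lambdas in the shared test utilities. A sketch of what it likely looks like, with the name and signature assumed from the call sites:

#include <string>

#include "openvino/core/except.hpp"
#include "openvino/core/node.hpp"

// Hypothetical reconstruction of the shared helper; the real location
// and exact signature in the test utils may differ.
inline std::string getRuntimeValue(const ov::Node::RTMap& rtInfo, const std::string& paramName) {
    auto it = rtInfo.find(paramName);
    OPENVINO_ASSERT(rtInfo.end() != it);
    return it->second.as<std::string>();
}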

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/group_convolution.cpp

Lines changed: 3 additions & 13 deletions
@@ -95,13 +95,8 @@ class GroupConvolutionLayerCPUTest : public testing::WithParamInterface<groupCon
     bool foundConv = false;
     for (const auto& node : execGraph->get_ops()) {
         const auto& rtInfo = node->get_rt_info();
-        auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };

-        if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
+        if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
             foundConv = true;
             ASSERT_EQ(3, node->inputs().size());
             break;
@@ -226,13 +221,8 @@ TEST_P(ExpectFallbackGroupConvolutionLayerCPUTest, CompareWithRefs) {
     auto function = compiledModel.get_runtime_model();
     for (const auto& node : function->get_ops()) {
         const auto& rtInfo = node->get_rt_info();
-        auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-
-        if ("Convolution" == getExecValue(ov::exec_model_info::LAYER_TYPE)) {
-            auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
+        if ("Convolution" == getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE)) {
+            auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
             ASSERT_TRUE(selectedType != primType) << "primType is unexpected: " << primType;
         }
     }

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/classes/fuse_transpose_reorder.cpp

Lines changed: 2 additions & 7 deletions
@@ -31,13 +31,8 @@ void FuseTransposeAndReorderTest::check_transpose_count(size_t expectedTranspose
     ASSERT_NE(nullptr, runtime_model);
     size_t actual_transpose_count = 0;
     for (const auto &node : runtime_model->get_ops()) {
-        const auto & rtInfo = node->get_rt_info();
-        auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-        if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Transpose") {
+        const auto& rtInfo = node->get_rt_info();
+        if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Transpose") {
             actual_transpose_count++;
         }
     }

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp

Lines changed: 31 additions & 61 deletions
@@ -100,24 +100,11 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
         std::tie(inputShapes, transpose, weiElemType, additionalConfig, cpuParams) = obj.param;

         std::ostringstream result;
-        for (const auto& shape : inputShapes) {
-            result << ov::test::utils::partialShape2str({shape.first}) << "_";
-        }
-        result << "TS=";
-        for (const auto& shape : inputShapes) {
-            result << "(";
-            if (!shape.second.empty()) {
-                auto itr = shape.second.begin();
-                do {
-                    result << ov::test::utils::vec2str(*itr);
-                } while (++itr != shape.second.end() && result << "_");
-            }
-            result << ")_";
-        }
+        for (size_t i = 0; i < inputShapes.size(); ++i)
+            result << "IS[" << i << "]=" << inputShapes[i] << "_";
         result << "transpose_a=" << transpose.first << "_";
         result << "transpose_b=" << transpose.second << "_";
-
-        result << "weiLemType=" << weiElemType << "_";
+        result << "weiElemType=" << weiElemType << "_";

         result << "config=(";
         for (const auto& configEntry : additionalConfig) {
@@ -137,25 +124,6 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
             std::swap(*(shape.end() - 1), *(shape.end() - 2));
     }

-    void check_fc_weights_precision(ElementType expectedWeiElemType) const {
-        auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string& paramName) -> std::string {
-            auto it = rtInfo.find(paramName);
-            OPENVINO_ASSERT(rtInfo.end() != it);
-            return it->second.as<std::string>();
-        };
-
-        const auto execFunction = compiledModel.get_runtime_model();
-        ASSERT_NE(nullptr, execFunction);
-        for (const auto& fcNode : execFunction->get_ops()) {
-            if (getExecValue(fcNode->get_rt_info(), ov::exec_model_info::LAYER_TYPE) == "FullyConnected") {
-                const auto& constNode = fcNode->get_input_node_shared_ptr(1);
-                ov::element::Type expectedType(
-                    getExecValue(constNode->get_rt_info(), ov::exec_model_info::OUTPUT_PRECISIONS));
-                ASSERT_EQ(expectedType, expectedWeiElemType);
-            }
-        }
-    }
-
     void SetUp() override {
         targetDevice = ov::test::utils::DEVICE_CPU;

@@ -196,23 +164,23 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec

         configuration.insert(additionalConfig.begin(), additionalConfig.end());

-        ElementType netType = ElementType::f32;
-        ElementType convertOutType = ElementType::f32;
+        inType = outType = netType = ElementType::f32;
         auto it = additionalConfig.find(ov::hint::inference_precision.name());
         if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
-            convertOutType = inType = outType = netType = ElementType::bf16;
+            netType = ElementType::bf16;
             weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
-        } else {
-            inType = outType = netType;
+            // Reorder between parameter and FullyConnected
+            // Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
+            reorderCount++;
         }

         std::string cpuNodeType = "FullyConnected";
-        selectedType = makeSelectedTypeStr(selectedType, outType);
+        selectedType = makeSelectedTypeStr(selectedType, netType);

         ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(inType, inShapeA)};
         std::shared_ptr<ov::Node> inputB = ov::test::utils::make_constant(weiConstElemType, inShapeB.get_shape());
-        if (weiConstElemType == ElementType::f16 || weiConstElemType == ElementType::bf16) {
-            inputB = std::make_shared<ov::op::v0::Convert>(inputB, convertOutType);
+        if (weiConstElemType != inType) {
+            inputB = std::make_shared<ov::op::v0::Convert>(inputB, inType);
             mark_as_decompression(inputB);
         }
         expectedWeiConstElemType = weiConstElemType;
@@ -223,17 +191,21 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
     }

     void check_execution_graph() {
+        CheckNodePrecisionsWithType(compiledModel, "FullyConnected", {netType, expectedWeiConstElemType}, {outType});
         CheckPluginRelatedResults(compiledModel, "FullyConnected");
         CheckNumberOfNodesWithType(compiledModel, "FullyConnected", fullyConnectedCount);
         CheckNumberOfNodesWithType(compiledModel, "Transpose", transposeCount);
         CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
-        CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
-        check_fc_weights_precision(expectedWeiConstElemType);
+        // Note: Convert node might be converted to Subgraph
+        CheckNumberOfNodesWithType(compiledModel, "Subgraph", 0);
+        CheckNumberOfNodesWithType(compiledModel, "Reorder", reorderCount);
     }

     size_t fullyConnectedCount = 1;
     size_t transposeCount = 0;
+    size_t reorderCount = 0;
     ElementType expectedWeiConstElemType = ElementType::f32;
+    ElementType netType = ElementType::f32;
 };

 TEST_P(MatMulDecompressConvertTest, CompareWithRefs) {
@@ -265,11 +237,9 @@ const std::vector<std::vector<InputShape>> inputShapes3D = {
     {{{-1, -1, -1}, {{1, 2, 3}, {1, 5, 3}}}, {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}},
 };

-ov::AnyMap emptyConfig = {/* empty config */};
-
 std::vector<ov::AnyMap> filter_additional_config_bf16() {
     std::vector<ov::AnyMap> additionalConfig;
-    if (ov::with_cpu_x86_avx512_core()) {
+    if (ov::with_cpu_x86_bfloat16()) {
         additionalConfig.push_back({{ov::hint::inference_precision(ov::element::bf16)}});
     }
     return additionalConfig;
@@ -306,7 +276,7 @@ std::vector<CPUSpecificParams> filter_specific_params_bf16() {
 const auto testParams2D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
                                                         ::testing::ValuesIn(transposeParams),
                                                         ::testing::Values(ElementType::f32),
-                                                        ::testing::Values(emptyConfig),
+                                                        ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                         ::testing::ValuesIn(filter_specific_params(true)));

 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
@@ -317,7 +287,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
 const auto testParams2D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
                                                    ::testing::ValuesIn(transposeParams),
                                                    ::testing::Values(ElementType::f16, ElementType::bf16),
-                                                   ::testing::Values(emptyConfig),
+                                                   ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                    ::testing::ValuesIn(filter_specific_params(false)));

 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D,
@@ -339,7 +309,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16,
 const auto testParams3D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
                                                         ::testing::ValuesIn(transposeParams),
                                                         ::testing::Values(ElementType::f32),
-                                                        ::testing::Values(emptyConfig),
+                                                        ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                         ::testing::ValuesIn(filter_specific_params(true)));

 INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
@@ -350,7 +320,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
 const auto testParams3D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
                                                    ::testing::ValuesIn(transposeParams),
                                                    ::testing::Values(ElementType::f16, ElementType::bf16),
-                                                   ::testing::Values(emptyConfig),
+                                                   ::testing::Values(CPUTestUtils::empty_plugin_config),
                                                    ::testing::ValuesIn(filter_specific_params(false)));

 INSTANTIATE_TEST_SUITE_P(smoke_FC_3D,
@@ -464,26 +434,26 @@ class MatMulDecompressConvertTest2 : public MatMulDecompressConvertTest {

         configuration.insert(additionalConfig.begin(), additionalConfig.end());

-        ElementType netType = ElementType::f32;
-        ElementType convertOutType = ElementType::f32;
+        inType = outType = netType = ElementType::f32;
         auto it = additionalConfig.find(ov::hint::inference_precision.name());
         if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
-            convertOutType = inType = outType = netType = ElementType::bf16;
+            netType = ElementType::bf16;
             weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
-        } else {
-            inType = outType = netType;
+            // Reorder between parameter and FullyConnected
+            // Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
+            reorderCount++;
         }

         std::string cpuNodeType = "FullyConnected";
-        selectedType = makeSelectedTypeStr(selectedType, outType);
+        selectedType = makeSelectedTypeStr(selectedType, netType);

         ov::ParameterVector params;
         for (auto&& shape : {inShapeFC0, inShapeFC1}) {
             params.push_back(std::make_shared<ov::op::v0::Parameter>(inType, shape));
         }
         std::shared_ptr<ov::Node> inputWeights = ov::test::utils::make_constant(weiConstElemType, inShapeWeights.get_shape());
-        if (weiConstElemType == ElementType::f16) {
-            inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, convertOutType);
+        if (weiConstElemType != inType) {
+            inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, inType);
             mark_as_decompression(inputWeights);
         }
         expectedWeiConstElemType = weiConstElemType;
@@ -509,7 +479,7 @@ const auto testParams2D_FP16_2_smoke =
     ::testing::Combine(::testing::Values(static_shapes_to_test_representation({{2, 3}, {2, 3}, {3, 4}})),
                        ::testing::Values(std::pair<bool, bool>{false, true}),
                        ::testing::Values(ElementType::f16),
-                       ::testing::Values(emptyConfig),
+                       ::testing::Values(CPUTestUtils::empty_plugin_config),
                        ::testing::ValuesIn(filter_specific_params(false)));

 INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_2,
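
For context, a rough self-contained sketch of the subgraph these tests build: a Parameter feeding a MatMul whose weights are a lower-precision Constant decompressed through a Convert. makeDecompressConvertModel is an illustrative name only; the real tests also call mark_as_decompression on the Convert, which is omitted here to keep the sketch free of plugin-internal headers.

#include <memory>
#include <vector>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/parameter.hpp"

std::shared_ptr<ov::Model> makeDecompressConvertModel() {
    auto A = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, 3});
    auto W = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{3, 4},
                                          std::vector<float>(12, 1.0f));
    // Weights do not match the activation precision, so a Convert is inserted;
    // the tests mark it as decompression so it is folded into FC weight handling.
    auto cvt = std::make_shared<ov::op::v0::Convert>(W, ov::element::f32);
    auto mm = std::make_shared<ov::op::v0::MatMul>(A, cvt);
    return std::make_shared<ov::Model>(ov::NodeVector{mm}, ov::ParameterVector{A});
}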

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/matmul_quantized_subgraph.cpp

Lines changed: 2 additions & 7 deletions
@@ -108,14 +108,9 @@ class MatmulBrgemmInt8Test : public testing::WithParamInterface<MatmulBrgemmInt8
     void check_node(std::shared_ptr<const ov::Model> function, const std::string& nodeName) {
         ASSERT_NE(nullptr, function);
         for (const auto &node : function->get_ops()) {
-            const auto & rtInfo = node->get_rt_info();
-            auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
-                auto it = rtInfo.find(paramName);
-                OPENVINO_ASSERT(rtInfo.end() != it);
-                return it->second.as<std::string>();
-            };
+            const auto& rtInfo = node->get_rt_info();
             if (node->get_friendly_name() == nodeName) {
-                auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
+                auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
                 ASSERT_TRUE(primTypeCheck(primType)) << "primType is unexpected: " << primType << " Expected: " << selectedType;
                 ASSERT_EQ(node->get_output_element_type(0), outType);
                 ASSERT_EQ(node->get_input_element_type(0), inType);
