Skip to content

Commit

Permalink
[CPU] Avoid unnecessary reordering for nodes with several consumers before output & updated FC bf16 config
Browse files Browse the repository at this point in the history
  • Loading branch information
v-Golubev committed Jan 20, 2025
1 parent d757efd commit 92aa520
Show file tree
Hide file tree
Showing 10 changed files with 91 additions and 125 deletions.
11 changes: 11 additions & 0 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,17 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
const auto parentEdge = outputNode->getParentEdgeAt(0);
const auto parent = parentEdge->getParent();
parent->setOriginalOutputPrecisionAtPort(parentEdge->getInputNum(), precToSet);
// If the parent has consumers except Output, precToSet is propagated to consumer's inputs
// to avoid precision mismatch (which leads to reorder insertion and unnecessary performance overheads)
if (parent->getChildEdges().size() > 1) {
for (size_t i = 0; i < parent->getChildEdges().size(); ++i) {
const auto childEdge = parent->getChildEdgeAt(i);
// Consumers from other parent's output shouldn't be changed
if (childEdge->getInputNum() != parentEdge->getInputNum())
continue;
childEdge->getChild()->setOriginalInputPrecisionAtPort(childEdge->getOutputNum(), precToSet);
}
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,12 @@ struct Require {
// clang-format off
static const TypeMapping dnnlFCTypeMapping {
// {src, wei, bia, dst} pt<src, wei, bias, dst>
{{_bf16, _bf16 | _f32, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())},
{{_bf16, _bf16 | _f32 | _f16, _any, _bf16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())},
{{_f16, _f16, _any, _f16 | _f32}, pt(bypass(), bypass(), use<3>(), bypass())},
// integer precision outputs are not supported for float precision inputs
{{_f32 | _bf16 | _f16, _any, _any, _i8 | _u8}, pt(bypass(), bypass(), use<0>(), use<0>())},
// compresses float weights which do not match input data precision
{{_f32, _half_float, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())},
{{_bf16, _f16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())},
{{_f16, _bf16, _any, _any | _any}, pt(bypass(), bypass(), use<0>(), use<0>())},
// quantization configuration
// int8 inner_product does not support f16 output and bias
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,7 @@ void ConvolutionLayerCPUTest::checkBiasFusing(ov::CompiledModel& execNet) const
bool foundConv = false;
for (const auto& node : execGraph->get_ops()) {
const auto& rtInfo = node->get_rt_info();
auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
auto it = rtInfo.find(paramName);
OPENVINO_ASSERT(rtInfo.end() != it);
return it->second.as<std::string>();
};

if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
foundConv = true;
ASSERT_EQ(3, node->inputs().size());
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,8 @@ class GroupConvolutionLayerCPUTest : public testing::WithParamInterface<groupCon
bool foundConv = false;
for (const auto& node : execGraph->get_ops()) {
const auto& rtInfo = node->get_rt_info();
auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
auto it = rtInfo.find(paramName);
OPENVINO_ASSERT(rtInfo.end() != it);
return it->second.as<std::string>();
};

if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Convolution") {
if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Convolution") {
foundConv = true;
ASSERT_EQ(3, node->inputs().size());
break;
Expand Down Expand Up @@ -228,13 +223,8 @@ TEST_P(ExpectFallbackGroupConvolutionLayerCPUTest, CompareWithRefs) {
auto function = compiledModel.get_runtime_model();
for (const auto& node : function->get_ops()) {
const auto& rtInfo = node->get_rt_info();
auto getExecValue = [&rtInfo](const std::string& paramName) -> std::string {
auto it = rtInfo.find(paramName);
OPENVINO_ASSERT(rtInfo.end() != it);
return it->second.as<std::string>();
};
if ("Convolution" == getExecValue(ov::exec_model_info::LAYER_TYPE)) {
auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
if ("Convolution" == getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE)) {
auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
ASSERT_TRUE(selectedType != primType) << "primType is unexpected: " << primType;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,8 @@ void FuseTransposeAndReorderTest::check_transpose_count(size_t expectedTranspose
ASSERT_NE(nullptr, runtime_model);
size_t actual_transpose_count = 0;
for (const auto &node : runtime_model->get_ops()) {
const auto & rtInfo = node->get_rt_info();
auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
auto it = rtInfo.find(paramName);
OPENVINO_ASSERT(rtInfo.end() != it);
return it->second.as<std::string>();
};
if (getExecValue(ov::exec_model_info::LAYER_TYPE) == "Transpose") {
const auto& rtInfo = node->get_rt_info();
if (getRuntimeValue(rtInfo, ov::exec_model_info::LAYER_TYPE) == "Transpose") {
actual_transpose_count++;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,24 +100,11 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
std::tie(inputShapes, transpose, weiElemType, additionalConfig, cpuParams) = obj.param;

std::ostringstream result;
for (const auto& shape : inputShapes) {
result << ov::test::utils::partialShape2str({shape.first}) << "_";
}
result << "TS=";
for (const auto& shape : inputShapes) {
result << "(";
if (!shape.second.empty()) {
auto itr = shape.second.begin();
do {
result << ov::test::utils::vec2str(*itr);
} while (++itr != shape.second.end() && result << "_");
}
result << ")_";
}
for (size_t i = 0; i < inputShapes.size(); ++i)
result << "IS[" << i << "]=" << inputShapes[i] << "_";
result << "transpose_a=" << transpose.first << "_";
result << "transpose_b=" << transpose.second << "_";

result << "weiLemType=" << weiElemType << "_";
result << "weiElemType=" << weiElemType << "_";

result << "config=(";
for (const auto& configEntry : additionalConfig) {
Expand All @@ -137,25 +124,6 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
std::swap(*(shape.end() - 1), *(shape.end() - 2));
}

void check_fc_weights_precision(ElementType expectedWeiElemType) const {
auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string& paramName) -> std::string {
auto it = rtInfo.find(paramName);
OPENVINO_ASSERT(rtInfo.end() != it);
return it->second.as<std::string>();
};

const auto execFunction = compiledModel.get_runtime_model();
ASSERT_NE(nullptr, execFunction);
for (const auto& fcNode : execFunction->get_ops()) {
if (getExecValue(fcNode->get_rt_info(), ov::exec_model_info::LAYER_TYPE) == "FullyConnected") {
const auto& constNode = fcNode->get_input_node_shared_ptr(1);
ov::element::Type expectedType(
getExecValue(constNode->get_rt_info(), ov::exec_model_info::OUTPUT_PRECISIONS));
ASSERT_EQ(expectedType, expectedWeiElemType);
}
}
}

void SetUp() override {
targetDevice = ov::test::utils::DEVICE_CPU;

Expand Down Expand Up @@ -196,23 +164,23 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec

configuration.insert(additionalConfig.begin(), additionalConfig.end());

ElementType netType = ElementType::f32;
ElementType convertOutType = ElementType::f32;
inType = outType = netType = ElementType::f32;
auto it = additionalConfig.find(ov::hint::inference_precision.name());
if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
convertOutType = inType = outType = netType = ElementType::bf16;
netType = ElementType::bf16;
weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
} else {
inType = outType = netType;
// Reorder between parameter and FullyConnected
// Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
reorderCount++;
}

std::string cpuNodeType = "FullyConnected";
selectedType = makeSelectedTypeStr(selectedType, outType);
selectedType = makeSelectedTypeStr(selectedType, netType);

ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(inType, inShapeA)};
std::shared_ptr<ov::Node> inputB = ov::test::utils::make_constant(weiConstElemType, inShapeB.get_shape());
if (weiConstElemType == ElementType::f16 || weiConstElemType == ElementType::bf16) {
inputB = std::make_shared<ov::op::v0::Convert>(inputB, convertOutType);
if (weiConstElemType != inType) {
inputB = std::make_shared<ov::op::v0::Convert>(inputB, inType);
mark_as_decompression(inputB);
}
expectedWeiConstElemType = weiConstElemType;
Expand All @@ -223,18 +191,22 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDec
}

virtual void check_execution_graph() {
CheckNodePrecisionsWithType(compiledModel, "FullyConnected", {netType, expectedWeiConstElemType, ov::element::undefined}, {outType});
CheckPluginRelatedResults(compiledModel, "FullyConnected");
CheckNumberOfNodesWithType(compiledModel, "FullyConnected", fullyConnectedCount);
CheckNumberOfNodesWithType(compiledModel, "Transpose", transposeCount);
CheckNumberOfNodesWithType(compiledModel, "Convert", convertCount);
CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
check_fc_weights_precision(expectedWeiConstElemType);
// Note: Convert node might be converted to Subgraph
CheckNumberOfNodesWithType(compiledModel, "Subgraph", 0);
CheckNumberOfNodesWithType(compiledModel, "Reorder", reorderCount);
}

size_t fullyConnectedCount = 1;
size_t transposeCount = 0;
size_t convertCount = 0;
size_t reorderCount = 0;
ElementType expectedWeiConstElemType = ElementType::f32;
ElementType netType = ElementType::f32;
};

TEST_P(MatMulDecompressConvertTest, CompareWithRefs) {
Expand Down Expand Up @@ -266,11 +238,9 @@ const std::vector<std::vector<InputShape>> inputShapes3D = {
{{{-1, -1, -1}, {{1, 2, 3}, {1, 5, 3}}}, {{1, 3, 4}, {{1, 3, 4}, {1, 3, 4}}}},
};

ov::AnyMap emptyConfig = {/* empty config */};

std::vector<ov::AnyMap> filter_additional_config_bf16() {
std::vector<ov::AnyMap> additionalConfig;
if (ov::with_cpu_x86_avx512_core()) {
if (ov::with_cpu_x86_bfloat16()) {
additionalConfig.push_back({{ov::hint::inference_precision(ov::element::bf16)}});
}
return additionalConfig;
Expand Down Expand Up @@ -307,7 +277,7 @@ std::vector<CPUSpecificParams> filter_specific_params_bf16() {
const auto testParams2D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
::testing::ValuesIn(transposeParams),
::testing::Values(ElementType::f32),
::testing::Values(emptyConfig),
::testing::Values(CPUTestUtils::empty_plugin_config),
::testing::ValuesIn(filter_specific_params(true)));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
Expand All @@ -318,7 +288,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP32,
const auto testParams2D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes2D),
::testing::ValuesIn(transposeParams),
::testing::Values(ElementType::f16, ElementType::bf16),
::testing::Values(emptyConfig),
::testing::Values(CPUTestUtils::empty_plugin_config),
::testing::ValuesIn(filter_specific_params(false)));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D,
Expand All @@ -340,7 +310,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_BF16,
const auto testParams3D_FP32_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
::testing::ValuesIn(transposeParams),
::testing::Values(ElementType::f32),
::testing::Values(emptyConfig),
::testing::Values(CPUTestUtils::empty_plugin_config),
::testing::ValuesIn(filter_specific_params(true)));

INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
Expand All @@ -351,7 +321,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_3D_FP32,
const auto testParams3D_smoke = ::testing::Combine(::testing::ValuesIn(inputShapes3D),
::testing::ValuesIn(transposeParams),
::testing::Values(ElementType::f16, ElementType::bf16),
::testing::Values(emptyConfig),
::testing::Values(CPUTestUtils::empty_plugin_config),
::testing::ValuesIn(filter_specific_params(false)));

INSTANTIATE_TEST_SUITE_P(smoke_FC_3D,
Expand Down Expand Up @@ -460,26 +430,26 @@ class MatMulDecompressConvertTest2 : public MatMulDecompressConvertTest {

configuration.insert(additionalConfig.begin(), additionalConfig.end());

ElementType netType = ElementType::f32;
ElementType convertOutType = ElementType::f32;
inType = outType = netType = ElementType::f32;
auto it = additionalConfig.find(ov::hint::inference_precision.name());
if (it != additionalConfig.end() && it->second.as<ov::element::Type>() == ov::element::bf16) {
convertOutType = inType = outType = netType = ElementType::bf16;
netType = ElementType::bf16;
weiConstElemType = (weiConstElemType != ElementType::f32) ? weiConstElemType : ElementType::bf16;
} else {
inType = outType = netType;
// Reorder between parameter and FullyConnected
// Note: reorder between FC and Result is not needed since FC primitive supports f32 output natively
reorderCount++;
}

std::string cpuNodeType = "FullyConnected";
selectedType = makeSelectedTypeStr(selectedType, outType);
selectedType = makeSelectedTypeStr(selectedType, netType);

ov::ParameterVector params;
for (auto&& shape : {inShapeFC0, inShapeFC1}) {
params.push_back(std::make_shared<ov::op::v0::Parameter>(inType, shape));
}
std::shared_ptr<ov::Node> inputWeights = ov::test::utils::make_constant(weiConstElemType, inShapeWeights.get_shape());
if (weiConstElemType == ElementType::f16) {
inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, convertOutType);
if (weiConstElemType != inType) {
inputWeights = std::make_shared<ov::op::v0::Convert>(inputWeights, inType);
mark_as_decompression(inputWeights);
}
expectedWeiConstElemType = weiConstElemType;
Expand All @@ -505,7 +475,7 @@ const auto testParams2D_FP16_2_smoke =
::testing::Combine(::testing::Values(static_shapes_to_test_representation({{2, 3}, {2, 3}, {3, 4}})),
::testing::Values(std::pair<bool, bool>{false, true}),
::testing::Values(ElementType::f16),
::testing::Values(emptyConfig),
::testing::Values(CPUTestUtils::empty_plugin_config),
::testing::ValuesIn(filter_specific_params(false)));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_2,
Expand Down Expand Up @@ -644,7 +614,7 @@ const auto testParams2D_FP16_3_smoke =
::testing::Combine(::testing::Values(static_shapes_to_test_representation({{1, 16, 32}, {32, 64}})),
::testing::Values(std::pair<bool, bool>{false, false}),
::testing::Values(ElementType::f16),
::testing::Values(emptyConfig),
::testing::Values(CPUTestUtils::empty_plugin_config),
::testing::ValuesIn(filter_specific_params(false)));

INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_3,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,9 @@ class MatmulBrgemmInt8Test : public testing::WithParamInterface<MatmulBrgemmInt8
void check_node(std::shared_ptr<const ov::Model> function, const std::string& nodeName) {
ASSERT_NE(nullptr, function);
for (const auto &node : function->get_ops()) {
const auto & rtInfo = node->get_rt_info();
auto getExecValue = [&rtInfo](const std::string & paramName) -> std::string {
auto it = rtInfo.find(paramName);
OPENVINO_ASSERT(rtInfo.end() != it);
return it->second.as<std::string>();
};
const auto& rtInfo = node->get_rt_info();
if (node->get_friendly_name() == nodeName) {
auto primType = getExecValue(ov::exec_model_info::IMPL_TYPE);
auto primType = getRuntimeValue(rtInfo, ov::exec_model_info::IMPL_TYPE);
ASSERT_TRUE(primTypeCheck(primType)) << "primType is unexpected: " << primType << " Expected: " << selectedType;
ASSERT_EQ(node->get_output_element_type(0), outType);
ASSERT_EQ(node->get_input_element_type(0), inType);
Expand Down
Loading

0 comments on commit 92aa520

Please sign in to comment.