Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CPU] Extend Concat node logic to avoid fallback on slow ref implementation. #4129

Merged
merged 8 commits into from
May 31, 2021
Merged
10 changes: 10 additions & 0 deletions inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,16 @@ PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &d
return res;
}


PartialBlkDesc PartialBlkDesc::makeTailC(const InferenceEngine::SizeVector &dims) {
    // Start from the plain (channel-first) descriptor and move the channel
    // dimension (position 1 of the outer order) to the innermost place,
    // yielding a tail-C ("channels last", e.g. nhwc/ndhwc) dimension order.
    PartialBlkDesc result = makePlain(dims);
    if (dims.size() > 2) {
        auto channelPos = result.outer_order.begin() + 1;
        std::rotate(channelPos, std::next(channelPos), result.outer_order.end());
    }
    return result;
}

PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) {
if (desc.getLayout() == InferenceEngine::ANY)
IE_THROW() << "Cannot extract partial blocked descriptor for `ANY` layout";
Expand Down
3 changes: 3 additions & 0 deletions inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class PartialBlkDesc {
/** Construct blocked Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size);

/** Construct per Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeTailC(const InferenceEngine::SizeVector &dims);

/** Compare operators. Allow to use it as key for std::map */
bool operator == (const PartialBlkDesc& it) const;
bool operator < (const PartialBlkDesc& it) const;
Expand Down
476 changes: 157 additions & 319 deletions inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ class MKLDNNConcatNode : public MKLDNNNode {

private:
size_t axis = 0;
bool canOptimizeNspc = false;

size_t inverseOrder(const InferenceEngine::SizeVector& order, size_t axis);
void execNspcSpecCase();

InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,15 @@ void MKLDNNShuffleChannelsNode::initSupportedPrimitiveDescriptors() {
impl_type = impl_desc_type::ref;
}

addSupportedPrimDesc({{TensorDescCreatorTypes::nspc, precision}},
{{TensorDescCreatorTypes::nspc, precision}},
// use ncsp as default for non-quantized networks and nspc for quantized
auto firstCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::nspc : TensorDescCreatorTypes::ncsp;
auto secondCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::ncsp : TensorDescCreatorTypes::nspc;

addSupportedPrimDesc({{firstCreatorType, precision}},
{{firstCreatorType, precision}},
impl_type, supportDynamicBatch_);
addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}},
{{TensorDescCreatorTypes::ncsp, precision}},
addSupportedPrimDesc({{secondCreatorType, precision}},
{{secondCreatorType, precision}},
impl_type, supportDynamicBatch_);
// canUseBlocked
if (axis_ != 1) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ngraph_functions/builders.hpp"
#include "test_utils/cpu_test_utils.hpp"

using namespace InferenceEngine;
using namespace CPUTestUtils;

namespace CPULayerTestsDefinitions {

// Full parameter set of a single Concat layer CPU test case.
typedef std::tuple<
size_t, // Concat axis
std::vector<std::vector<size_t>>, // Input shapes
InferenceEngine::Precision, // Network precision
std::string, // Device name
CPUSpecificParams
> concatCPUTestParams;

// Parameterized CPU test for the Concat layer: builds an ngraph function with
// a single Concat node over the parameterized inputs, runs it on CPU and
// (via CPUTestsBase) checks which CPU primitive implementation was selected.
class ConcatLayerCPUTest : public testing::WithParamInterface<concatCPUTestParams>,
                           virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
    static std::string getTestCaseName(testing::TestParamInfo<concatCPUTestParams> obj) {
        // Use size_t for the axis to match the concatCPUTestParams tuple
        // declaration and avoid a narrowing size_t -> int conversion.
        size_t axis;
        std::vector<std::vector<size_t>> inputShapes;
        InferenceEngine::Precision netPrecision;
        std::string targetName;
        CPUSpecificParams cpuParams;
        std::tie(axis, inputShapes, netPrecision, targetName, cpuParams) = obj.param;

        std::ostringstream result;
        result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
        result << "axis=" << axis << "_";
        result << "netPRC=" << netPrecision.name() << "_";
        result << "trgDev=" << targetName << "_";
        result << CPUTestsBase::getTestCaseName(cpuParams);
        return result.str();
    }

protected:
    void SetUp() override {
        size_t axis;
        std::vector<std::vector<size_t>> inputShapes;
        InferenceEngine::Precision netPrecision;
        CPUSpecificParams cpuParams;
        std::tie(axis, inputShapes, netPrecision, targetDevice, cpuParams) = this->GetParam();
        inPrc = outPrc = netPrecision;

        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
        // The expected primitive type reported by the plugin is suffixed with
        // the precision name, e.g. "unknown_FP32".
        selectedType += std::string("_") + inPrc.name();

        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
        auto params = ngraph::builder::makeParams(ngPrc, inputShapes);
        auto paramOuts = ngraph::helpers::convert2OutputVector(
                ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
        auto concat = std::make_shared<ngraph::opset1::Concat>(paramOuts, axis);

        function = makeNgraphFunction(ngPrc, params, concat, "concat");
    }
};

// Runs inference, compares CPU plugin output with the reference and verifies
// that the expected "Concatenation" CPU primitive implementation was used.
TEST_P(ConcatLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()

Run();
CheckPluginRelatedResults(executableNetwork, "Concatenation");
}

namespace {
// CPU-specific configurations: {input formats}, {output formats},
// {impl-type filter}, expected selected impl type. "_ref" variants expect the
// reference implementation to be selected, the others the optimized one.
const auto planar_4D_ref = CPUSpecificParams{{nchw}, {nchw}, {"ref"}, "ref"};
const auto planar_5D_ref = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref"}, "ref"};

const auto planar_4D = CPUSpecificParams{{nchw}, {nchw}, {}, "unknown"};
const auto planar_5D = CPUSpecificParams{{ncdhw}, {ncdhw}, {}, "unknown"};

// Channels-last (nspc) layouts.
const auto planarChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"};
const auto planarChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"};

// Channel-blocked layouts with block size 8.
const auto blocked8_4D = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "unknown"};
const auto blocked8_5D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "unknown"};

const auto blocked8_4D_ref = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "ref"};
const auto blocked8_5D_ref = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "ref"};

// Channel-blocked layouts with block size 16.
const auto blocked16_4D = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "unknown"};
const auto blocked16_5D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "unknown"};

const auto blocked16_4D_ref = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "ref"};
const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref"};

// List of precisions natively supported by mkldnn.
const std::vector<Precision> netPrecisions = {
Precision::I8,
Precision::I32,
Precision::FP32,
Precision::BF16
};

// 4D cases. Axis-1 (channel) "inPlace" suites expect an optimized
// implementation; the other-axis suites pair with "_ref"/nspc configs.
INSTANTIATE_TEST_CASE_P(concat_Concat4D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5},
{1, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planarChannels_4D, blocked8_4D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);

// 5D cases, mirroring the 4D coverage above.
INSTANTIATE_TEST_CASE_P(concat_Concat5D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5, 7},
{1, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planarChannels_5D, blocked8_5D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);

// Low-rank (2D/3D) shapes with default (empty) CPU params.
INSTANTIATE_TEST_CASE_P(smoke_Concat_inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 3, 5},
{2, 4, 5}},
std::vector<std::vector<size_t>>{{2, 3},
{2, 4}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat3D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4, 5},
{2, 4, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat_1D_2D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4},
{3, 4}},
std::vector<std::vector<size_t>>{{2}, {3}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);

} // namespace
} // namespace CPULayerTestsDefinitions
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ void FuseTransposeAndReorderTest2::CreateGraph() {
transpose2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {});

auto concat = ngraph::builder::makeConcat({transpose1, transpose2}, 1);
concat->get_rt_info() = makeCPUInfo({memFmt1, memFmt1}, {memFmt1}, {});

ngraph::ResultVector results{std::make_shared<ngraph::opset5::Result>(concat)};
function = std::make_shared<ngraph::Function>(results, params, "Transpose_Transpose_Concat");
Expand Down