Performance problems fixes. Part 1 (#49)
Maxim Andronov authored and dmitry-gorokhov committed May 5, 2021
1 parent bcb3c37 commit 9807690
Showing 26 changed files with 429 additions and 455 deletions.
5 changes: 2 additions & 3 deletions inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -22,7 +22,7 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_concat_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_conv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_convert_node.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_crop_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_strided_slice_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_deconv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_def_conv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_eltwise_node.cpp
@@ -98,7 +98,6 @@ set(LAYERS
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_to_dense.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/bucketize.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/squeeze.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/strided_slice.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/topkrois_onnx.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unique.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unsqueeze.cpp
@@ -108,7 +107,7 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/topk.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/proposal.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/proposal_imp.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/cum_sum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/cum_sum.cpp
)

file(GLOB SOURCES
33 changes: 23 additions & 10 deletions inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@@ -38,6 +38,7 @@
#include "utils/debug_capabilities.h"
#include "utils/node_dumper.h"
#include "utils/ngraph_utils.hpp"
#include "utils/cpu_utils.hpp"

#include <ngraph/node.hpp>
#include <ngraph/function.hpp>
@@ -194,16 +195,6 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana

auto orderedOps = func->get_ordered_ops();


// // The input layer precision has to be equal to the InputData precision
// std::map<std::string, Precision> changedPrecision;
// for (const auto& input : inputs) {
// auto inputLayer = getCreatorLayer(input.second->getInputData()).lock();
// if (inputLayer) {
// inputLayer->precision = inputLayer->outData[0]->getTensorDesc().getPrecision();
// }
// }
//
// // TODO [NM]: unordered_map is preferred from performance perspective. Needs hash for ngraph::Node
// std::unordered_map<ngraph::Node, MKLDNNNodePtr> op2node;
std::map<std::shared_ptr<ngraph::Node>, MKLDNNNodePtr> op2node;
@@ -287,6 +278,28 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana
graphEdges.push_back(edge);
graphNodes.push_back(outNode);
}

// change the precision of input/output nodes to avoid extra data conversion when input/output blobs are set
// we also need to change the input/output precisions of consumers/producers to avoid inserting reorders
for (auto &input : inputNodesMap) {
const auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision());
input.second->setOriginalOutputPrecisionAtPort(0, precToSet);
const auto childEdges = input.second->getChildEdgesAtPort(0);
for (size_t i = 0; i < childEdges.size(); i++) {
const auto child = childEdges[i]->getChild();
child->setOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum(), precToSet);
}
}
for (auto &output : outputNodesMap) {
const auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision());
output.second->setOriginalInputPrecisionAtPort(0, precToSet);
const auto parentEdges = output.second->getParentEdgesAtPort(0);
for (size_t i = 0; i < parentEdges.size(); i++) {
const auto parent = parentEdges[i]->getParent();
parent->setOriginalOutputPrecisionAtPort(parentEdges[i]->getInputNum(), precToSet);
}
}

//
// // Replicate input nodes
// for (const auto& input : inputs) {
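Note: both the precision-normalization block added above and the simplified PushInputData below lean on normalizeToSupportedPrecision() from utils/cpu_utils.hpp, which this diff does not show. A minimal sketch, assuming it mirrors the switch statement removed from PushInputData (with UNSPECIFIED signalling an unsupported precision):

```cpp
#include <ie_precision.hpp>

// Hedged sketch of the helper, not the actual implementation.
inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine::Precision prec) {
    switch (prec) {
        case InferenceEngine::Precision::U8:
        case InferenceEngine::Precision::I8:
        case InferenceEngine::Precision::I32:
        case InferenceEngine::Precision::BF16:
        case InferenceEngine::Precision::FP32:
            return prec;                                     // natively supported by mkldnn
        case InferenceEngine::Precision::BOOL:
            return InferenceEngine::Precision::U8;           // same byte layout
        case InferenceEngine::Precision::U16:
        case InferenceEngine::Precision::I16:
        case InferenceEngine::Precision::I64:
        case InferenceEngine::Precision::U64:
            return InferenceEngine::Precision::I32;          // converted before pushing
        default:
            return InferenceEngine::Precision::UNSPECIFIED;  // treated as unsupported
    }
}
```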
52 changes: 52 additions & 0 deletions inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
@@ -77,6 +77,9 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
FuseConvolutionAndZeroPoints(graph);
graph.RemoveDroppedNodes();

FuseConvolutionAndSimpleOperationThroughMaxPool(graph);
graph.RemoveDroppedNodes();

// TODO [NM]: While fusing a simple operation into any node (except Eltwise) we need to check that the other inputs are Constant nodes.
FuseConvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
@@ -1022,6 +1025,55 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
// }
}

// TODO: mandrono: unite with FuseConvolutionAndSimpleOperation
void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();

auto isSutableParentNode = [](MKLDNNNodePtr node) {
return (node->getType() == Convolution || node->getType() == BinaryConvolution) && node->getChildEdges().size() == 1 &&
node->getOriginalOutputPrecisionAtPort(0) == Precision::FP32;
};

auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSutableParentNode(parentNode)) {
parent++;
continue;
}

auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (childNode->getAlgorithm() != PoolingMax || childNode->getChildEdges().size() != 1) {
parent++;
continue;
}

auto fuseCandidate = childNode->getChildEdgeAt(0)->getChild();
if (parentNode->getType() == BinaryConvolution && !parentNode->canFuse(fuseCandidate)) {
parent++;
continue;
}

if (!one_of(fuseCandidate->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseBoundedRelu, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseLinear, EltwiseAbs, EltwiseSquare, EltwiseSqrt)) {
parent++;
continue;
}
parentNode->addFusedNode(fuseCandidate);
parentNode->addOriginalLayer(fuseCandidate->getOriginalLayers());
auto parentEdges = fuseCandidate->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent() == childNode)
continue;

removeEdge(graph, p_edge);
}
graph.DropNode(fuseCandidate);
}
}

void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();

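To make the rewrite concrete: the new pass looks for Convolution → MaxPool → <simple activation> and re-attaches the activation to the convolution as a fused post-op, leaving MaxPool to consume the fused node's output. A hypothetical opset1 function exhibiting the target chain (shapes, values, and the helper name chosen arbitrarily):

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

// Hedged sketch: builds Convolution -> MaxPool -> Relu, the shape of subgraph
// that FuseConvolutionAndSimpleOperationThroughMaxPool rewrites.
std::shared_ptr<ngraph::Function> makeConvPoolRelu() {
    using namespace ngraph;
    auto input = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 3, 224, 224});
    auto weights = opset1::Constant::create(element::f32, Shape{16, 3, 3, 3}, {0.0f});
    auto conv = std::make_shared<opset1::Convolution>(
        input, weights, Strides{1, 1}, CoordinateDiff{1, 1}, CoordinateDiff{1, 1}, Strides{1, 1});
    auto pool = std::make_shared<opset1::MaxPool>(
        conv, Strides{2, 2}, Shape{0, 0}, Shape{0, 0}, Shape{2, 2});
    auto relu = std::make_shared<opset1::Relu>(pool);
    return std::make_shared<Function>(NodeVector{relu}, ParameterVector{input});
}
```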
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h
@@ -24,6 +24,7 @@ class MKLDNNGraphOptimizer {
void FuseMultiplyAndAdd(MKLDNNGraph &graph);
void MergeTwoEqualScaleShifts(MKLDNNGraph& graph);
void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperationThroughMaxPool(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndDepthwise(MKLDNNGraph &graph);
void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph);
78 changes: 28 additions & 50 deletions inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp
@@ -20,7 +20,8 @@
#include "nodes/common/cpu_memcpy.h"
#include "mkldnn_async_infer_request.h"
#include <debug.h>

#include "utils/general_utils.h"
#include "utils/cpu_utils.hpp"

MKLDNNPlugin::MKLDNNInferRequest::MKLDNNInferRequest(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs,
@@ -103,33 +104,14 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
IE_THROW() << "Input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " << input.first;
}
auto inPrec = input.second->getTensorDesc().getPrecision();
if (graph->hasMeanImageFor(input.first) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) {
inPrec = InferenceEngine::Precision::FP32;
} else {
inPrec = normalizeToSupportedPrecision(inPrec);
}

switch (inPrec) {
// these precisions are supported by mkldnn, so we push the blob directly
case InferenceEngine::Precision::I8:
case InferenceEngine::Precision::I32:
case InferenceEngine::Precision::BF16:
case InferenceEngine::Precision::FP32: {
break;
}
// these precisions are supported by mkldnn, so we push the blob directly
// BUT if a mean image exists, we convert the blob and send FP32
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL: {
if (graph->hasMeanImageFor(input.first))
inPrec = InferenceEngine::Precision::FP32;
break;
}
// these precisions are unsupported by mkldnn, so we convert the blob and send I32
case InferenceEngine::Precision::U16:
case InferenceEngine::Precision::I16:
case InferenceEngine::Precision::I64:
case InferenceEngine::Precision::U64: {
inPrec = InferenceEngine::Precision::I32;
break;
}
default:
IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
if (inPrec == InferenceEngine::Precision::UNSPECIFIED) {
IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
}

// User can initialize input via setBlob API using tensorDesc with default (ANY) layout.
@@ -257,11 +239,10 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::

_inputs[name] = make_blob_with_precision(desc);
_inputs[name]->allocate();
// [NM] TODO mandrono
// if (desc.getPrecision() == originPrecision &&
// graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
// externalPtr[name] = _inputs[name]->buffer();
// }
if (desc.getPrecision() == originPrecision &&
graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = _inputs[name]->buffer();
}
data = _inputs[name];
checkBlob(data, name, true);
return data;
@@ -287,10 +268,9 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::

_outputs[name] = make_blob_with_precision(desc);
_outputs[name]->allocate();
// [NM] TODO mandrono
// if (desc.getPrecision() == originPrecision && !graph->getProperty().batchLimit) {
// externalPtr[name] = _outputs[name]->buffer();
// }
if (desc.getPrecision() == originPrecision && !graph->getProperty().batchLimit) {
externalPtr[name] = _outputs[name]->buffer();
}
data = _outputs[name];
checkBlob(data, name, false);
return data;
Expand Down Expand Up @@ -354,13 +334,12 @@ void MKLDNNPlugin::MKLDNNInferRequest::SetBlob(const std::string& name, const In
IE_THROW(ParameterMismatch) << "Failed to set input blob. Blocking descriptor mismatch.";
}

// [NM] TODO mandrono: if the input precision is FP32 but the node precision is not FP32, the conversion is not performed
// if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
// graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
// externalPtr[name] = data->buffer();
// } else if (externalPtr.find(name) != externalPtr.end()) {
// externalPtr.erase(name);
// }
if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
externalPtr.erase(name);
}
_inputs[name] = data;
}
} else {
Expand All @@ -387,13 +366,12 @@ void MKLDNNPlugin::MKLDNNInferRequest::SetBlob(const std::string& name, const In
IE_THROW(ParameterMismatch) << "Failed to set output blob. Blocking descriptor mismatch.";
}

// TODO: [NM]
// if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
// !graph->getProperty().batchLimit) {
// externalPtr[name] = data->buffer();
// } else if (externalPtr.find(name) != externalPtr.end()) {
// externalPtr.erase(name);
// }
if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
!graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
externalPtr.erase(name);
}
_outputs[name] = data;
}
}
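The un-commented externalPtr bookkeeping above restores the zero-copy I/O path: when a blob's precision and layout already match what the graph expects (and no mean image or batch limit applies), the plugin reads and writes the user's memory directly instead of staging through an internal buffer. A hedged usage sketch with the classic InferenceEngine API (model path hypothetical):

```cpp
#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml");     // hypothetical model path
    auto execNet = core.LoadNetwork(network, "CPU");
    auto request = execNet.CreateInferRequest();

    const auto inputName = network.getInputsInfo().begin()->first;
    // A plugin-allocated FP32 blob with the default layout now hits the
    // externalPtr path, so Infer() consumes it without an extra copy.
    auto blob = request.GetBlob(inputName);
    // ... fill blob->buffer() with input data ...
    request.Infer();
    return 0;
}
```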
33 changes: 18 additions & 15 deletions inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
@@ -107,6 +107,8 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
{ "PRelu", Eltwise },
{ "Erf", Eltwise },
{ "Reshape", Reshape },
{ "Squeeze", Reshape },
{ "Unsqueeze", Reshape },
{ "Softmax", Softmax },
{ "Reorder", Reorder },
{ "Roll", Roll },
@@ -145,7 +147,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
// { "Eltwise", Eltwise },
// { "Mod", Eltwise },
// { "Power", Eltwise },
// { "StridedSlice", StridedSlice },
{ "StridedSlice", StridedSlice },
// { "Reshape", Reshape },
{ "Tile", Tile },
// { "SimplerNMS", SimplerNMS },
@@ -163,7 +165,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
// { "RNNSequence", RNNSeq },
{ "FakeQuantize", FakeQuantize },
// { "BinaryConvolution", BinaryConvolution },
// { "DeformableConvolution", DeformableConvolution },
{ "DeformableConvolution", DeformableConvolution },
// { "TensorIterator", TensorIterator },
// { "Loop", TensorIterator },
{ "ReadValue", MemoryInput}, // for construction from name ctor, arbitrary name is used
@@ -1284,19 +1286,20 @@ MKLDNNNode* MKLDNNNode::NodesFactory::create(const std::shared_ptr<ngraph::Node>
}
}

if (newNode == nullptr) {
try {
std::unique_ptr<MKLDNNNode> ol(new MKLDNNReferenceNode(op, eng, w_cache, errorMessage));
if (ol != nullptr && ol->created(extMgr))
newNode = ol.release();
} catch (const InferenceEngine::Exception& ex) {
if (ex.getStatus() != NOT_IMPLEMENTED) {
throw;
} else {
errorMessage += getExceptionDescWithoutStatus(ex);
}
}
}
// TODO [NM]: enable after all nodes are migrated to ngraph
// if (newNode == nullptr) {
// try {
// std::unique_ptr<MKLDNNNode> ol(new MKLDNNReferenceNode(op, eng, w_cache, errorMessage));
// if (ol != nullptr && ol->created(extMgr))
// newNode = ol.release();
// } catch (const InferenceEngine::Exception& ex) {
// if (ex.getStatus() != NOT_IMPLEMENTED) {
// throw;
// } else {
// errorMessage += getExceptionDescWithoutStatus(ex);
// }
// }
// }

// TODO [NM]: Not implemented
// // WA-start : TI node requires all attributes to construct internal subgraph
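The map edits above route StridedSlice and DeformableConvolution to their dedicated node implementations and alias Squeeze/Unsqueeze to Reshape. Dispatch itself is a straight case-insensitive lookup against that table; a hedged sketch of how such a resolver typically looks (map and function names assumed, not shown in this diff):

```cpp
// Hedged sketch: resolve an IR layer-type string to the internal Type enum.
Type TypeFromName(const std::string& name) {
    auto it = type_to_name_tbl.find(name);   // caseless_unordered_map ignores case
    return it != type_to_name_tbl.end() ? it->second : Unknown;
}
```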
7 changes: 7 additions & 0 deletions inference-engine/src/mkldnn_plugin/mkldnn_node.h
@@ -582,6 +582,13 @@ class MKLDNNNode : public InferenceEngine::details::no_copy {
originalInputPrecisions[port] = precision;
}

void setOriginalOutputPrecisionAtPort(size_t port, InferenceEngine::Precision precision) {
if (originalOutputPrecisions.size() <= port) {
IE_THROW() << "Incorrect output port number for node " << getName();
}
originalOutputPrecisions[port] = precision;
}

void addOriginalInputPrecision(InferenceEngine::Precision precision) {
originalInputPrecisions.push_back(precision);
}
inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp
@@ -8,17 +8,21 @@
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>

NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ReshapeFullyConnectedFusion, "ReshapeFullyConnectedFusion", 0);

MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
auto m_reshape = ngraph::pattern::wrap_type<ngraph::opset1::Reshape>(ngraph::pattern::has_static_shape());
auto m_fc = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>({m_reshape, ngraph::pattern::any_input()});
ngraph::OutputVector twoInputs = {m_reshape, ngraph::pattern::any_input()};
ngraph::OutputVector threeInputs = {m_reshape, ngraph::pattern::any_input(), ngraph::pattern::any_input()};
auto fcTwoInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(twoInputs, ngraph::pattern::has_static_shape());
auto fcThreeInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(threeInputs, ngraph::pattern::has_static_shape());
const auto fcTwoOrThreeInputs = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{fcTwoInputs, fcThreeInputs});

ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
auto & pattern_to_output = m.get_pattern_value_map();
auto fc = pattern_to_output[m_fc].get_node_shared_ptr();
auto reshape = pattern_to_output[m_reshape].get_node_shared_ptr();
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher &m) {
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode>(m.get_match_root());
auto reshape = std::dynamic_pointer_cast<ngraph::opset1::Reshape>(fc->get_input_node_shared_ptr(0));

// Check that Reshape reshapes 4D tensor to 2D or input shape = output shape
auto shape_in = reshape->input_value(0).get_shape();
@@ -71,6 +75,6 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(m_fc, "ReshapeFullyConnectedFusion");
auto m = std::make_shared<ngraph::pattern::Matcher>(fcTwoOrThreeInputs, "ReshapeFullyConnectedFusion");
register_matcher(m, callback);
}
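The matcher rework above exists because a FullyConnected node may carry either two inputs (data, weights) or three (data, weights, bias); pattern::op::Or lets a single matcher accept both forms. The same idiom in isolation, as a hedged generic example:

```cpp
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/pattern/matcher.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>

// Hedged sketch: one matcher whose root is either a Relu or a Sigmoid.
std::shared_ptr<ngraph::pattern::Matcher> makeReluOrSigmoidMatcher() {
    auto relu = ngraph::pattern::wrap_type<ngraph::opset1::Relu>();
    auto sigmoid = ngraph::pattern::wrap_type<ngraph::opset1::Sigmoid>();
    auto either = std::make_shared<ngraph::pattern::op::Or>(
        ngraph::OutputVector{relu, sigmoid});
    return std::make_shared<ngraph::pattern::Matcher>(either, "ReluOrSigmoid");
}
```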