Performance problems fixes. Part 1 (#49)
Maxim Andronov authored and dmitry-gorokhov committed May 5, 2021
1 parent bcb3c37 commit 9807690
Showing 26 changed files with 429 additions and 455 deletions.
5 changes: 2 additions & 3 deletions inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -22,7 +22,7 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_concat_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_conv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_convert_node.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_crop_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_strided_slice_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_deconv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_def_conv_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_eltwise_node.cpp
@@ -98,7 +98,6 @@ set(LAYERS
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/sparse_to_dense.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/bucketize.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/squeeze.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/strided_slice.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/topkrois_onnx.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unique.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/unsqueeze.cpp
@@ -108,7 +107,7 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/topk.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/proposal.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/proposal_imp.cpp
# ${CMAKE_CURRENT_SOURCE_DIR}/nodes/cum_sum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/cum_sum.cpp
)

file(GLOB SOURCES
33 changes: 23 additions & 10 deletions inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@@ -38,6 +38,7 @@
#include "utils/debug_capabilities.h"
#include "utils/node_dumper.h"
#include "utils/ngraph_utils.hpp"
#include "utils/cpu_utils.hpp"

#include <ngraph/node.hpp>
#include <ngraph/function.hpp>
@@ -194,16 +195,6 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana

auto orderedOps = func->get_ordered_ops();


// // The input layer precision has to be equal to the InputData precision
// std::map<std::string, Precision> changedPrecision;
// for (const auto& input : inputs) {
// auto inputLayer = getCreatorLayer(input.second->getInputData()).lock();
// if (inputLayer) {
// inputLayer->precision = inputLayer->outData[0]->getTensorDesc().getPrecision();
// }
// }
//
// // TODO [NM]: unordered_map is preferred from performance perspective. Needs hash for ngraph::Node
// std::unordered_map<ngraph::Node, MKLDNNNodePtr> op2node;
std::map<std::shared_ptr<ngraph::Node>, MKLDNNNodePtr> op2node;
@@ -287,6 +278,28 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana
graphEdges.push_back(edge);
graphNodes.push_back(outNode);
}

// change the precision of input/output nodes to avoid extra data conversion when input/output blobs are set
// we also need to change the input/output precisions of consumers/producers to avoid inserting reorders
for (auto &input : inputNodesMap) {
const auto precToSet = normalizeToSupportedPrecision(inputsInfo.at(input.first)->getPrecision());
input.second->setOriginalOutputPrecisionAtPort(0, precToSet);
const auto childEdges = input.second->getChildEdgesAtPort(0);
for (size_t i = 0; i < childEdges.size(); i++) {
const auto child = childEdges[i]->getChild();
child->setOriginalInputPrecisionAtPort(childEdges[i]->getOutputNum(), precToSet);
}
}
for (auto &output : outputNodesMap) {
const auto precToSet = normalizeToSupportedPrecision(outputsInfo.at(output.first)->getPrecision());
output.second->setOriginalInputPrecisionAtPort(0, precToSet);
const auto parentEdges = output.second->getParentEdgesAtPort(0);
for (size_t i = 0; i < parentEdges.size(); i++) {
const auto parent = parentEdges[i]->getParent();
parent->setOriginalOutputPrecisionAtPort(parentEdges[i]->getInputNum(), precToSet);
}
}

//
// // Replicate input nodes
// for (const auto& input : inputs) {
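Note: both the precision-normalization block added above and the simplified PushInputData below lean on normalizeToSupportedPrecision() from utils/cpu_utils.hpp, which this diff does not show. A minimal sketch, assuming it mirrors the switch statement removed from PushInputData (with UNSPECIFIED signalling an unsupported precision):

```cpp
#include <ie_precision.hpp>

// Hedged sketch of the helper, not the actual implementation.
inline InferenceEngine::Precision normalizeToSupportedPrecision(InferenceEngine::Precision prec) {
    switch (prec) {
        case InferenceEngine::Precision::U8:
        case InferenceEngine::Precision::I8:
        case InferenceEngine::Precision::I32:
        case InferenceEngine::Precision::BF16:
        case InferenceEngine::Precision::FP32:
            return prec;                                     // natively supported by mkldnn
        case InferenceEngine::Precision::BOOL:
            return InferenceEngine::Precision::U8;           // same byte layout
        case InferenceEngine::Precision::U16:
        case InferenceEngine::Precision::I16:
        case InferenceEngine::Precision::I64:
        case InferenceEngine::Precision::U64:
            return InferenceEngine::Precision::I32;          // converted before pushing
        default:
            return InferenceEngine::Precision::UNSPECIFIED;  // treated as unsupported
    }
}
```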
52 changes: 52 additions & 0 deletions inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
@@ -77,6 +77,9 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
FuseConvolutionAndZeroPoints(graph);
graph.RemoveDroppedNodes();

FuseConvolutionAndSimpleOperationThroughMaxPool(graph);
graph.RemoveDroppedNodes();

// TODO [NM]: While fusing a simple operation into any node (except Eltwise) we need to check that the other inputs are Constant nodes.
FuseConvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
@@ -1022,6 +1025,55 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
// }
}

// TODO: mandrono: unite with FuseConvolutionAndSimpleOperation
void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();

auto isSutableParentNode = [](MKLDNNNodePtr node) {
return (node->getType() == Convolution || node->getType() == BinaryConvolution) && node->getChildEdges().size() == 1 &&
node->getOriginalOutputPrecisionAtPort(0) == Precision::FP32;
};

auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSutableParentNode(parentNode)) {
parent++;
continue;
}

auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (childNode->getAlgorithm() != PoolingMax || childNode->getChildEdges().size() != 1) {
parent++;
continue;
}

auto fuseCandidate = childNode->getChildEdgeAt(0)->getChild();
if (parentNode->getType() == BinaryConvolution && !parentNode->canFuse(fuseCandidate)) {
parent++;
continue;
}

if (!one_of(fuseCandidate->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseBoundedRelu, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseLinear, EltwiseAbs, EltwiseSquare, EltwiseSqrt)) {
parent++;
continue;
}
parentNode->addFusedNode(fuseCandidate);
parentNode->addOriginalLayer(fuseCandidate->getOriginalLayers());
auto parentEdges = fuseCandidate->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent() == childNode)
continue;

removeEdge(graph, p_edge);
}
graph.DropNode(fuseCandidate);
}
}

void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();

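To make the rewrite concrete: the new pass looks for Convolution → MaxPool → <simple activation> and re-attaches the activation to the convolution as a fused post-op, leaving MaxPool to consume the fused node's output. A hypothetical opset1 function exhibiting the target chain (shapes, values, and the helper name chosen arbitrarily):

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

// Hedged sketch: builds Convolution -> MaxPool -> Relu, the shape of subgraph
// that FuseConvolutionAndSimpleOperationThroughMaxPool rewrites.
std::shared_ptr<ngraph::Function> makeConvPoolRelu() {
    using namespace ngraph;
    auto input = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 3, 224, 224});
    auto weights = opset1::Constant::create(element::f32, Shape{16, 3, 3, 3}, {0.0f});
    auto conv = std::make_shared<opset1::Convolution>(
        input, weights, Strides{1, 1}, CoordinateDiff{1, 1}, CoordinateDiff{1, 1}, Strides{1, 1});
    auto pool = std::make_shared<opset1::MaxPool>(
        conv, Strides{2, 2}, Shape{0, 0}, Shape{0, 0}, Shape{2, 2});
    auto relu = std::make_shared<opset1::Relu>(pool);
    return std::make_shared<Function>(NodeVector{relu}, ParameterVector{input});
}
```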
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h
@@ -24,6 +24,7 @@ class MKLDNNGraphOptimizer {
void FuseMultiplyAndAdd(MKLDNNGraph &graph);
void MergeTwoEqualScaleShifts(MKLDNNGraph& graph);
void FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperationThroughMaxPool(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndDepthwise(MKLDNNGraph &graph);
void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph);
78 changes: 28 additions & 50 deletions inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp
@@ -20,7 +20,8 @@
#include "nodes/common/cpu_memcpy.h"
#include "mkldnn_async_infer_request.h"
#include <debug.h>

#include "utils/general_utils.h"
#include "utils/cpu_utils.hpp"

MKLDNNPlugin::MKLDNNInferRequest::MKLDNNInferRequest(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs,
@@ -103,33 +104,14 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
IE_THROW() << "Input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " << input.first;
}
auto inPrec = input.second->getTensorDesc().getPrecision();
if (graph->hasMeanImageFor(input.first) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) {
inPrec = InferenceEngine::Precision::FP32;
} else {
inPrec = normalizeToSupportedPrecision(inPrec);
}

switch (inPrec) {
// these precisions are supported by mkldnn, so we push the blob directly
case InferenceEngine::Precision::I8:
case InferenceEngine::Precision::I32:
case InferenceEngine::Precision::BF16:
case InferenceEngine::Precision::FP32: {
break;
}
// these precisions are supported by mkldnn, so we push the blob directly
// BUT if a mean image exists, we convert the blob and send FP32
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL: {
if (graph->hasMeanImageFor(input.first))
inPrec = InferenceEngine::Precision::FP32;
break;
}
// these precisions are unsupported by mkldnn, so we convert the blob and send I32
case InferenceEngine::Precision::U16:
case InferenceEngine::Precision::I16:
case InferenceEngine::Precision::I64:
case InferenceEngine::Precision::U64: {
inPrec = InferenceEngine::Precision::I32;
break;
}
default:
IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
if (inPrec == InferenceEngine::Precision::UNSPECIFIED) {
IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
}

// User can initialize input via setBlob API using tensorDesc with default (ANY) layout.
@@ -257,11 +239,10 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::

_inputs[name] = make_blob_with_precision(desc);
_inputs[name]->allocate();
// [NM] TODO mandrono
// if (desc.getPrecision() == originPrecision &&
// graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
// externalPtr[name] = _inputs[name]->buffer();
// }
if (desc.getPrecision() == originPrecision &&
graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = _inputs[name]->buffer();
}
data = _inputs[name];
checkBlob(data, name, true);
return data;
@@ -287,10 +268,9 @@ InferenceEngine::Blob::Ptr MKLDNNPlugin::MKLDNNInferRequest::GetBlob(const std::

_outputs[name] = make_blob_with_precision(desc);
_outputs[name]->allocate();
// [NM] TODO mandrono
// if (desc.getPrecision() == originPrecision && !graph->getProperty().batchLimit) {
// externalPtr[name] = _outputs[name]->buffer();
// }
if (desc.getPrecision() == originPrecision && !graph->getProperty().batchLimit) {
externalPtr[name] = _outputs[name]->buffer();
}
data = _outputs[name];
checkBlob(data, name, false);
return data;
Expand Down Expand Up @@ -354,13 +334,12 @@ void MKLDNNPlugin::MKLDNNInferRequest::SetBlob(const std::string& name, const In
IE_THROW(ParameterMismatch) << "Failed to set input blob. Blocking descriptor mismatch.";
}

// [NM] TODO mandrono: if the input precision is FP32 but the node precision is not FP32, the conversion is not performed
// if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
// graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
// externalPtr[name] = data->buffer();
// } else if (externalPtr.find(name) != externalPtr.end()) {
// externalPtr.erase(name);
// }
if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
graph->_meanImages.find(name) == graph->_meanImages.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
externalPtr.erase(name);
}
_inputs[name] = data;
}
} else {
Expand All @@ -387,13 +366,12 @@ void MKLDNNPlugin::MKLDNNInferRequest::SetBlob(const std::string& name, const In
IE_THROW(ParameterMismatch) << "Failed to set output blob. Blocking descriptor mismatch.";
}

// TODO: [NM]
// if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
// !graph->getProperty().batchLimit) {
// externalPtr[name] = data->buffer();
// } else if (externalPtr.find(name) != externalPtr.end()) {
// externalPtr.erase(name);
// }
if (data->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32 &&
!graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
externalPtr.erase(name);
}
_outputs[name] = data;
}
}
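The un-commented externalPtr bookkeeping above restores the zero-copy I/O path: when a blob's precision and layout already match what the graph expects (and no mean image or batch limit applies), the plugin reads and writes the user's memory directly instead of staging through an internal buffer. A hedged usage sketch with the classic InferenceEngine API (model path hypothetical):

```cpp
#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml");     // hypothetical model path
    auto execNet = core.LoadNetwork(network, "CPU");
    auto request = execNet.CreateInferRequest();

    const auto inputName = network.getInputsInfo().begin()->first;
    // A plugin-allocated FP32 blob with the default layout now hits the
    // externalPtr path, so Infer() consumes it without an extra copy.
    auto blob = request.GetBlob(inputName);
    // ... fill blob->buffer() with input data ...
    request.Infer();
    return 0;
}
```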
33 changes: 18 additions & 15 deletions inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
@@ -107,6 +107,8 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
{ "PRelu", Eltwise },
{ "Erf", Eltwise },
{ "Reshape", Reshape },
{ "Squeeze", Reshape },
{ "Unsqueeze", Reshape },
{ "Softmax", Softmax },
{ "Reorder", Reorder },
{ "Roll", Roll },
@@ -145,7 +147,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
// { "Eltwise", Eltwise },
// { "Mod", Eltwise },
// { "Power", Eltwise },
// { "StridedSlice", StridedSlice },
{ "StridedSlice", StridedSlice },
// { "Reshape", Reshape },
{ "Tile", Tile },
// { "SimplerNMS", SimplerNMS },
@@ -163,7 +165,7 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
// { "RNNSequence", RNNSeq },
{ "FakeQuantize", FakeQuantize },
// { "BinaryConvolution", BinaryConvolution },
// { "DeformableConvolution", DeformableConvolution },
{ "DeformableConvolution", DeformableConvolution },
// { "TensorIterator", TensorIterator },
// { "Loop", TensorIterator },
{ "ReadValue", MemoryInput}, // for construction from name ctor, arbitrary name is used
@@ -1284,19 +1286,20 @@ MKLDNNNode* MKLDNNNode::NodesFactory::create(const std::shared_ptr<ngraph::Node>
}
}

if (newNode == nullptr) {
try {
std::unique_ptr<MKLDNNNode> ol(new MKLDNNReferenceNode(op, eng, w_cache, errorMessage));
if (ol != nullptr && ol->created(extMgr))
newNode = ol.release();
} catch (const InferenceEngine::Exception& ex) {
if (ex.getStatus() != NOT_IMPLEMENTED) {
throw;
} else {
errorMessage += getExceptionDescWithoutStatus(ex);
}
}
}
// TODO [NM]: enable after all nodes are migrated to ngraph
// if (newNode == nullptr) {
// try {
// std::unique_ptr<MKLDNNNode> ol(new MKLDNNReferenceNode(op, eng, w_cache, errorMessage));
// if (ol != nullptr && ol->created(extMgr))
// newNode = ol.release();
// } catch (const InferenceEngine::Exception& ex) {
// if (ex.getStatus() != NOT_IMPLEMENTED) {
// throw;
// } else {
// errorMessage += getExceptionDescWithoutStatus(ex);
// }
// }
// }

// TODO [NM]: Not implemented
// // WA-start : TI node requires all attributes to construct internal subgraph
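The map edits above route StridedSlice and DeformableConvolution to their dedicated node implementations and alias Squeeze/Unsqueeze to Reshape. Dispatch itself is a straight case-insensitive lookup against that table; a hedged sketch of how such a resolver typically looks (map and function names assumed, not shown in this diff):

```cpp
// Hedged sketch: resolve an IR layer-type string to the internal Type enum.
Type TypeFromName(const std::string& name) {
    auto it = type_to_name_tbl.find(name);   // caseless_unordered_map ignores case
    return it != type_to_name_tbl.end() ? it->second : Unknown;
}
```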
7 changes: 7 additions & 0 deletions inference-engine/src/mkldnn_plugin/mkldnn_node.h
@@ -582,6 +582,13 @@ class MKLDNNNode : public InferenceEngine::details::no_copy {
originalInputPrecisions[port] = precision;
}

void setOriginalOutputPrecisionAtPort(size_t port, InferenceEngine::Precision precision) {
if (originalOutputPrecisions.size() <= port) {
IE_THROW() << "Incorrect output port number for node " << getName();
}
originalOutputPrecisions[port] = precision;
}

void addOriginalInputPrecision(InferenceEngine::Precision precision) {
originalInputPrecisions.push_back(precision);
}
inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp
@@ -8,17 +8,21 @@
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>

NGRAPH_RTTI_DEFINITION(MKLDNNPlugin::ReshapeFullyConnectedFusion, "ReshapeFullyConnectedFusion", 0);

MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
auto m_reshape = ngraph::pattern::wrap_type<ngraph::opset1::Reshape>(ngraph::pattern::has_static_shape());
auto m_fc = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>({m_reshape, ngraph::pattern::any_input()});
ngraph::OutputVector twoInputs = {m_reshape, ngraph::pattern::any_input()};
ngraph::OutputVector threeInputs = {m_reshape, ngraph::pattern::any_input(), ngraph::pattern::any_input()};
auto fcTwoInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(twoInputs, ngraph::pattern::has_static_shape());
auto fcThreeInputs = ngraph::pattern::wrap_type<MKLDNNPlugin::FullyConnectedNode>(threeInputs, ngraph::pattern::has_static_shape());
const auto fcTwoOrThreeInputs = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{fcTwoInputs, fcThreeInputs});

ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
auto & pattern_to_output = m.get_pattern_value_map();
auto fc = pattern_to_output[m_fc].get_node_shared_ptr();
auto reshape = pattern_to_output[m_reshape].get_node_shared_ptr();
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher &m) {
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode>(m.get_match_root());
auto reshape = std::dynamic_pointer_cast<ngraph::opset1::Reshape>(fc->get_input_node_shared_ptr(0));

// Check that Reshape reshapes 4D tensor to 2D or input shape = output shape
auto shape_in = reshape->input_value(0).get_shape();
@@ -71,6 +75,6 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
return true;
};

auto m = std::make_shared<ngraph::pattern::Matcher>(m_fc, "ReshapeFullyConnectedFusion");
auto m = std::make_shared<ngraph::pattern::Matcher>(fcTwoOrThreeInputs, "ReshapeFullyConnectedFusion");
register_matcher(m, callback);
}
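The matcher rework above exists because a FullyConnected node may carry either two inputs (data, weights) or three (data, weights, bias); pattern::op::Or lets a single matcher accept both forms. The same idiom in isolation, as a hedged generic example:

```cpp
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/pattern/matcher.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>

// Hedged sketch: one matcher whose root is either a Relu or a Sigmoid.
std::shared_ptr<ngraph::pattern::Matcher> makeReluOrSigmoidMatcher() {
    auto relu = ngraph::pattern::wrap_type<ngraph::opset1::Relu>();
    auto sigmoid = ngraph::pattern::wrap_type<ngraph::opset1::Sigmoid>();
    auto either = std::make_shared<ngraph::pattern::op::Or>(
        ngraph::OutputVector{relu, sigmoid});
    return std::make_shared<ngraph::pattern::Matcher>(either, "ReluOrSigmoid");
}
```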