From a3e7466413e959c47831ebc3ebd535998c84c951 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 3 Aug 2021 09:44:10 +0300 Subject: [PATCH] [GPU] Disabled default mem allocation for input_layout --- .../clDNN/runtime/ocl/ocl_memory.cpp | 3 ++ .../thirdparty/clDNN/src/crop.cpp | 5 ++- .../clDNN/src/include/primitive_inst.h | 18 ++++++----- .../clDNN/src/include/program_node.h | 4 +++ .../thirdparty/clDNN/src/input_layout.cpp | 2 ++ .../thirdparty/clDNN/src/loop.cpp | 5 +-- .../thirdparty/clDNN/src/network.cpp | 31 +++++++++---------- .../thirdparty/clDNN/src/program.cpp | 14 ++++++++- .../thirdparty/clDNN/src/reorder.cpp | 2 +- .../thirdparty/clDNN/src/reshape.cpp | 2 +- .../clDNN/tests/test_cases/memory_test.cpp | 14 ++++----- 11 files changed, 59 insertions(+), 41 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp index 0023213fe50605..253084bf341324 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp @@ -362,6 +362,9 @@ shared_mem_params gpu_usm::get_internal_params() const { std::vector ocl_surfaces_lock::get_handles(std::vector mem) const { std::vector res; for (auto& m : mem) { + if (!m) + continue; + auto mem_type = m->get_internal_params().mem_type; if (mem_type == shared_mem_type::shared_mem_vasurface || mem_type == shared_mem_type::shared_mem_dxbuffer) { res.push_back(static_cast(m->get_internal_params().mem)); diff --git a/inference-engine/thirdparty/clDNN/src/crop.cpp b/inference-engine/thirdparty/clDNN/src/crop.cpp index fe45ed3baf88dd..df99a948e3cb96 100644 --- a/inference-engine/thirdparty/clDNN/src/crop.cpp +++ b/inference-engine/thirdparty/clDNN/src/crop.cpp @@ -129,10 +129,8 @@ crop_inst::typed_primitive_inst(network& network, crop_node const& node) : paren ref_in_sizes, "Invalid Batch offset: exceeds data for output!"); - if (node.can_be_optimized()) { - build_deps(); + if (node.can_be_optimized() && !node.has_mutable_tensors()) reuse_input(); - } } void crop_inst::on_execute() { @@ -146,6 +144,7 @@ void crop_inst::on_execute() { } void crop_inst::reuse_input() { + build_deps(); _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout()); } } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h index 863d2267dc1c3c..88551aefca6c28 100644 --- a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h @@ -6,6 +6,7 @@ #pragma once #include "cldnn/primitives/primitive.hpp" #include "cldnn/primitives/concatenation.hpp" +#include "cldnn/primitives/input_layout.hpp" #include "cldnn/runtime/error_handler.hpp" #include "cldnn/runtime/event.hpp" #include "cldnn/runtime/memory.hpp" @@ -77,7 +78,11 @@ class primitive_inst { memory& dep_memory(size_t index) const { return dependencies().at(index)->output_memory(); } memory::ptr dep_memory_ptr(size_t index) const { return dependencies().at(index)->output_memory_ptr(); } - memory& output_memory() const { return *_output; } + memory& output_memory() const { + if (!_output) + throw std::runtime_error("[GPU] output is not allocated"); + return *_output; + } memory::ptr output_memory_ptr() const { return _output; } size_t inputs_memory_count() const { return _node.get_primitive()->input_size(); } primitive_type_id type() const { return 
_node.type(); } @@ -128,12 +133,8 @@ class primitive_inst { size_t get_fused_mem_count() const { return _node.get_fused_inputs_count(); } size_t get_fused_mem_offset() const { return _node.get_fused_primitives()[0].dep_start_idx; } - bool has_mutable_input() const { - return _has_mutable_input; - } - - void set_mutable_input(bool val) { - _has_mutable_input = val; + bool has_mutable_tensors() const { + return _node.has_mutable_tensors(); } bool is_output() const { @@ -169,7 +170,6 @@ class primitive_inst { bool _output_changed; // todo: implement output reuse if neither of inputs has changed bool _has_valid_input = true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst) - bool _has_mutable_input = false; memory::ptr allocate_output(); static std::vector> build_exec_deps( @@ -260,6 +260,8 @@ class typed_primitive_inst_base : public primitive_inst { typ_node.get_users().front()->can_be_optimized()) { // check if the only user is concat return false; } + if (typ_node.template is_type()) + return false; return true; } }; diff --git a/inference-engine/thirdparty/clDNN/src/include/program_node.h b/inference-engine/thirdparty/clDNN/src/include/program_node.h index 2b38d85d966a60..a19eb94b49a2c8 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_node.h +++ b/inference-engine/thirdparty/clDNN/src/include/program_node.h @@ -220,6 +220,9 @@ struct program_node { bool can_be_optimized() const { return optimized; } void can_be_optimized(bool opt) { optimized = opt; } + bool has_mutable_tensors() const { return _has_mutable_tensors; } + void set_mutable_tensors(bool val) { _has_mutable_tensors = val; } + // check/set if the node's buffer can be shared during the memory pool optimization bool can_share_buffer() const { return share_buffer; } void can_share_buffer(bool share) { share_buffer = share; } @@ -337,6 +340,7 @@ struct program_node { uint8_t user_mark = 0; bool optimized = false; bool share_buffer = true; + bool _has_mutable_tensors = false; std::array _support_padding_in_axis; mutable bool has_reused_memory = false; diff --git a/inference-engine/thirdparty/clDNN/src/input_layout.cpp b/inference-engine/thirdparty/clDNN/src/input_layout.cpp index ddc4c5470ac2e8..5d84a2002bdfbf 100644 --- a/inference-engine/thirdparty/clDNN/src/input_layout.cpp +++ b/inference-engine/thirdparty/clDNN/src/input_layout.cpp @@ -35,6 +35,8 @@ void input_layout_inst::set_data(memory::ptr mem) { if (mem->is_allocated_by(get_network().get_engine())) { _output = mem; } else { + if (!_output) + _output = get_network().get_engine().allocate_memory(mem->get_layout(), false); mem_lock src(mem, get_network().get_stream()); mem_lock dst(_output, get_network().get_stream()); std::copy(src.begin(), src.end(), dst.begin()); diff --git a/inference-engine/thirdparty/clDNN/src/loop.cpp b/inference-engine/thirdparty/clDNN/src/loop.cpp index d36f9476bcb8f4..3c0ae5cd7b4661 100644 --- a/inference-engine/thirdparty/clDNN/src/loop.cpp +++ b/inference-engine/thirdparty/clDNN/src/loop.cpp @@ -349,7 +349,7 @@ void loop_inst::preprocess_input_memory() { bool is_concatenated_input = (input_map->axis >= 0); if (is_concatenated_input) { layout sliced_layout - = body_network->get_primitive(input_map->internal_id)->output_memory().get_layout(); + = body_network->get_primitive(input_map->internal_id)->get_node().get_output_layout(); const int64_t max_iteration = node.get_max_iteration(); std::vector sliced_mems; sliced_mems.reserve(max_iteration); @@ -367,9 +367,6 @@ void 
loop_inst::preprocess_input_memory() { concatenated_input_mem_mapping_info.sliced_data_prim = body_network->get_primitive(input_map->internal_id); iteration_mem.push_back(concatenated_input_mem_mapping_info); } else { - if (memory->get_layout().data_type != body_network->get_primitive(input_map->internal_id)->output_memory().get_layout().data_type) { - CLDNN_ERROR_MESSAGE(id(), "incompatible datatypes"); - } body_network->set_input_data(input_map->internal_id, memory); } } diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp index 1aef6da691c232..0ee9c6bbecfd0c 100644 --- a/inference-engine/thirdparty/clDNN/src/network.cpp +++ b/inference-engine/thirdparty/clDNN/src/network.cpp @@ -275,7 +275,8 @@ void network::set_arguments() { return; for (auto const& prim : _exec_order) { - prim->set_arguments(); + if (!prim->has_mutable_tensors()) + prim->set_arguments(); } _reset_arguments = false; } @@ -325,22 +326,26 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) { std::vector<std::shared_ptr<primitive_inst>
> chain; std::stack> candidates; auto& eng = get_engine(); - const auto& mem_orig = p_inst->output_memory(); + const auto mem_orig = p_inst->output_memory_ptr(); auto add_mdata_chain = [&](std::shared_ptr& p_inst) { auto mdata_ptr = std::dynamic_pointer_cast(p_inst); - if (!mdata_ptr) + if (!mdata_ptr || !mem_orig) return; // special handling for mutable data, which can share // its attached memory with both its inputs and outputs for (auto& dep : p_inst->dependencies()) { + if (dep->has_mutable_tensors()) + continue; // check dependencies - if (eng.is_the_same_buffer(mem_orig, dep->output_memory())) { + if (eng.is_the_same_buffer(*mem_orig, dep->output_memory())) { chain.push_back(std::const_pointer_cast(dep)); } // then second order dependencies for (auto& second_dep : dep->dependencies()) { - if (eng.is_the_same_buffer(mem_orig, second_dep->output_memory())) { + if (second_dep->has_mutable_tensors()) + continue; + if (eng.is_the_same_buffer(*mem_orig, second_dep->output_memory())) { chain.push_back(std::const_pointer_cast(second_dep)); } } @@ -350,7 +355,7 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) {
get_users(); for (const auto& usr : users) { auto usr_prim = get_primitive(usr->id()); - if (eng.is_the_same_buffer(mem_orig, usr_prim->output_memory())) { + if (eng.is_the_same_buffer(*mem_orig, usr_prim->output_memory())) { chain.push_back(usr_prim); } } @@ -364,11 +369,11 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) {
output_memory(); - if (eng.is_the_same_buffer(mem_orig, mem_cand)) { + if (eng.is_the_same_buffer(*mem_orig, mem_cand)) { auto nc_cand = std::const_pointer_cast(cand); chain.push_back(nc_cand); add_mdata_chain(nc_cand); @@ -379,7 +384,7 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) {
output_memory(); - if (eng.is_the_same_buffer(mem_orig, mem_dep)) { + if (eng.is_the_same_buffer(*mem_orig, mem_dep)) { auto nc_dep = std::const_pointer_cast(dep); chain.push_back(nc_dep); add_mdata_chain(nc_dep); @@ -554,7 +559,7 @@ void network::execute_impl(const std::vector& events) { // If a node has mutable input or it's an output, then the input/output buffers might be changed // So we need to set arguments on each execution. - if (inst->has_mutable_input() || inst->is_output()) { + if (inst->has_mutable_tensors() || inst->is_output()) { inst->set_arguments(); } execute_primitive(inst, events); @@ -702,12 +707,6 @@ void network::allocate_primitive_instance(program_node const& node) { return; auto inst = node.type()->create_instance(*this, node); - for (auto& dep : node.get_dependencies()) { - if (dep->is_type() || dep->is_type() || dep->can_be_optimized()) { - inst->set_mutable_input(true); - break; - } - } _primitives[node.id()] = inst; if (node.is_input()) diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index 3caf87274074a4..a47b0ed7e69409 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -142,6 +142,16 @@ void program::init_kernels() { for (auto& n : get_processing_order()) { if (n->get_selected_impl()) n->get_selected_impl()->init_kernels(); + + if (n->is_type() || n->can_be_optimized()) + n->set_mutable_tensors(true); + + for (auto& dep : n->get_dependencies()) { + if (dep->is_type() || dep->is_type() || dep->can_be_optimized()) { + n->set_mutable_tensors(true); + break; + } + } } } @@ -711,8 +721,10 @@ void program::reverse_connection(program_node& dep_node, program_node& user_node program_node& program::get_or_create(std::shared_ptr prim) { auto itr = nodes_map.lower_bound(prim->id); - if (itr != nodes_map.end() && itr->first == prim->id) + if (itr != nodes_map.end() && itr->first == prim->id) { + std::cerr << "get_or_create: get!\n"; return *itr->second; + } auto new_node = prim->type->create_node(*this, prim); nodes_map.insert(itr, {prim->id, new_node}); diff --git a/inference-engine/thirdparty/clDNN/src/reorder.cpp b/inference-engine/thirdparty/clDNN/src/reorder.cpp index 8985fc2f07325f..8d0cc048b8519e 100644 --- a/inference-engine/thirdparty/clDNN/src/reorder.cpp +++ b/inference-engine/thirdparty/clDNN/src/reorder.cpp @@ -188,7 +188,7 @@ std::string reorder_inst::to_string(reorder_node const& node) { reorder_inst::typed_primitive_inst(network& network, reorder_node const& node) : parent(network, node, !node.can_be_optimized()) { - if (node.can_be_optimized()) + if (node.can_be_optimized() && !node.has_mutable_tensors()) reuse_input(); auto input_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/reshape.cpp b/inference-engine/thirdparty/clDNN/src/reshape.cpp index b6de6de3b88fe7..673dbb7eaef243 100644 --- a/inference-engine/thirdparty/clDNN/src/reshape.cpp +++ b/inference-engine/thirdparty/clDNN/src/reshape.cpp @@ -83,7 +83,7 @@ reshape_inst::typed_primitive_inst(network& network, reshape_node const& node) : // then create new memory object as the reinterpreted output of the previous primitive if (!node.can_be_optimized()) _output = allocate_output(); - else + else if (!node.has_mutable_tensors()) reuse_input(); } diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp index 
861cd2672cd4e9..0e0937370c59bf 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp @@ -86,7 +86,7 @@ TEST(memory_pool, basic_non_padded_relu_pipe) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 64); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 48); } TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) { @@ -118,7 +118,7 @@ TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)896); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)640); } TEST(memory_pool, multi_outputs_network) { @@ -153,7 +153,7 @@ TEST(memory_pool, multi_outputs_network) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1536); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1280); } TEST(memory_pool, oooq) { @@ -191,7 +191,7 @@ TEST(memory_pool, oooq) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2304); } TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) { @@ -396,14 +396,14 @@ TEST(memory_pool, shared_mem_pool_diff_batches) { auto outputs = network_first.execute(); auto dev_info = engine->get_device_info(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)2392); topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1 network network_second(*engine, topo, bo); network_second.set_input_data("input", input_1); auto outputs_second = network_second.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)2808); } TEST(memory_pool, shared_dep_two_output) { @@ -449,7 +449,7 @@ TEST(memory_pool, shared_dep_two_output) { network network(*engine, topo, bo); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)256); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)192); } TEST(memory_pool, non_opt_intermidate_opt_after) {