From a3e7466413e959c47831ebc3ebd535998c84c951 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 3 Aug 2021 09:44:10 +0300 Subject: [PATCH] [GPU] Disabled default mem allocation for input_layout --- .../clDNN/runtime/ocl/ocl_memory.cpp | 3 ++ .../thirdparty/clDNN/src/crop.cpp | 5 ++- .../clDNN/src/include/primitive_inst.h | 18 ++++++----- .../clDNN/src/include/program_node.h | 4 +++ .../thirdparty/clDNN/src/input_layout.cpp | 2 ++ .../thirdparty/clDNN/src/loop.cpp | 5 +-- .../thirdparty/clDNN/src/network.cpp | 31 +++++++++---------- .../thirdparty/clDNN/src/program.cpp | 14 ++++++++- .../thirdparty/clDNN/src/reorder.cpp | 2 +- .../thirdparty/clDNN/src/reshape.cpp | 2 +- .../clDNN/tests/test_cases/memory_test.cpp | 14 ++++----- 11 files changed, 59 insertions(+), 41 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp index 0023213fe50605..253084bf341324 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp @@ -362,6 +362,9 @@ shared_mem_params gpu_usm::get_internal_params() const { std::vector ocl_surfaces_lock::get_handles(std::vector mem) const { std::vector res; for (auto& m : mem) { + if (!m) + continue; + auto mem_type = m->get_internal_params().mem_type; if (mem_type == shared_mem_type::shared_mem_vasurface || mem_type == shared_mem_type::shared_mem_dxbuffer) { res.push_back(static_cast(m->get_internal_params().mem)); diff --git a/inference-engine/thirdparty/clDNN/src/crop.cpp b/inference-engine/thirdparty/clDNN/src/crop.cpp index fe45ed3baf88dd..df99a948e3cb96 100644 --- a/inference-engine/thirdparty/clDNN/src/crop.cpp +++ b/inference-engine/thirdparty/clDNN/src/crop.cpp @@ -129,10 +129,8 @@ crop_inst::typed_primitive_inst(network& network, crop_node const& node) : paren ref_in_sizes, "Invalid Batch offset: exceeds data for output!"); - if (node.can_be_optimized()) { - build_deps(); + if (node.can_be_optimized() && !node.has_mutable_tensors()) reuse_input(); - } } void crop_inst::on_execute() { @@ -146,6 +144,7 @@ void crop_inst::on_execute() { } void crop_inst::reuse_input() { + build_deps(); _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout()); } } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h index 863d2267dc1c3c..88551aefca6c28 100644 --- a/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h +++ b/inference-engine/thirdparty/clDNN/src/include/primitive_inst.h @@ -6,6 +6,7 @@ #pragma once #include "cldnn/primitives/primitive.hpp" #include "cldnn/primitives/concatenation.hpp" +#include "cldnn/primitives/input_layout.hpp" #include "cldnn/runtime/error_handler.hpp" #include "cldnn/runtime/event.hpp" #include "cldnn/runtime/memory.hpp" @@ -77,7 +78,11 @@ class primitive_inst { memory& dep_memory(size_t index) const { return dependencies().at(index)->output_memory(); } memory::ptr dep_memory_ptr(size_t index) const { return dependencies().at(index)->output_memory_ptr(); } - memory& output_memory() const { return *_output; } + memory& output_memory() const { + if (!_output) + throw std::runtime_error("[GPU] output is not allocated"); + return *_output; + } memory::ptr output_memory_ptr() const { return _output; } size_t inputs_memory_count() const { return _node.get_primitive()->input_size(); } primitive_type_id type() const { return 
_node.type(); } @@ -128,12 +133,8 @@ class primitive_inst { size_t get_fused_mem_count() const { return _node.get_fused_inputs_count(); } size_t get_fused_mem_offset() const { return _node.get_fused_primitives()[0].dep_start_idx; } - bool has_mutable_input() const { - return _has_mutable_input; - } - - void set_mutable_input(bool val) { - _has_mutable_input = val; + bool has_mutable_tensors() const { + return _node.has_mutable_tensors(); } bool is_output() const { @@ -169,7 +170,6 @@ class primitive_inst { bool _output_changed; // todo: implement output reuse if neither of inputs has changed bool _has_valid_input = true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst) - bool _has_mutable_input = false; memory::ptr allocate_output(); static std::vector> build_exec_deps( @@ -260,6 +260,8 @@ class typed_primitive_inst_base : public primitive_inst { typ_node.get_users().front()->can_be_optimized()) { // check if the only user is concat return false; } + if (typ_node.template is_type()) + return false; return true; } }; diff --git a/inference-engine/thirdparty/clDNN/src/include/program_node.h b/inference-engine/thirdparty/clDNN/src/include/program_node.h index 2b38d85d966a60..a19eb94b49a2c8 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_node.h +++ b/inference-engine/thirdparty/clDNN/src/include/program_node.h @@ -220,6 +220,9 @@ struct program_node { bool can_be_optimized() const { return optimized; } void can_be_optimized(bool opt) { optimized = opt; } + bool has_mutable_tensors() const { return _has_mutable_tensors; } + void set_mutable_tensors(bool val) { _has_mutable_tensors = val; } + // check/set if the node's buffer can be shared during the memory pool optimization bool can_share_buffer() const { return share_buffer; } void can_share_buffer(bool share) { share_buffer = share; } @@ -337,6 +340,7 @@ struct program_node { uint8_t user_mark = 0; bool optimized = false; bool share_buffer = true; + bool _has_mutable_tensors = false; std::array _support_padding_in_axis; mutable bool has_reused_memory = false; diff --git a/inference-engine/thirdparty/clDNN/src/input_layout.cpp b/inference-engine/thirdparty/clDNN/src/input_layout.cpp index ddc4c5470ac2e8..5d84a2002bdfbf 100644 --- a/inference-engine/thirdparty/clDNN/src/input_layout.cpp +++ b/inference-engine/thirdparty/clDNN/src/input_layout.cpp @@ -35,6 +35,8 @@ void input_layout_inst::set_data(memory::ptr mem) { if (mem->is_allocated_by(get_network().get_engine())) { _output = mem; } else { + if (!_output) + _output = get_network().get_engine().allocate_memory(mem->get_layout(), false); mem_lock src(mem, get_network().get_stream()); mem_lock dst(_output, get_network().get_stream()); std::copy(src.begin(), src.end(), dst.begin()); diff --git a/inference-engine/thirdparty/clDNN/src/loop.cpp b/inference-engine/thirdparty/clDNN/src/loop.cpp index d36f9476bcb8f4..3c0ae5cd7b4661 100644 --- a/inference-engine/thirdparty/clDNN/src/loop.cpp +++ b/inference-engine/thirdparty/clDNN/src/loop.cpp @@ -349,7 +349,7 @@ void loop_inst::preprocess_input_memory() { bool is_concatenated_input = (input_map->axis >= 0); if (is_concatenated_input) { layout sliced_layout - = body_network->get_primitive(input_map->internal_id)->output_memory().get_layout(); + = body_network->get_primitive(input_map->internal_id)->get_node().get_output_layout(); const int64_t max_iteration = node.get_max_iteration(); std::vector sliced_mems; sliced_mems.reserve(max_iteration); @@ -367,9 +367,6 @@ void 
loop_inst::preprocess_input_memory() { concatenated_input_mem_mapping_info.sliced_data_prim = body_network->get_primitive(input_map->internal_id); iteration_mem.push_back(concatenated_input_mem_mapping_info); } else { - if (memory->get_layout().data_type != body_network->get_primitive(input_map->internal_id)->output_memory().get_layout().data_type) { - CLDNN_ERROR_MESSAGE(id(), "incompatible datatypes"); - } body_network->set_input_data(input_map->internal_id, memory); } } diff --git a/inference-engine/thirdparty/clDNN/src/network.cpp b/inference-engine/thirdparty/clDNN/src/network.cpp index 1aef6da691c232..0ee9c6bbecfd0c 100644 --- a/inference-engine/thirdparty/clDNN/src/network.cpp +++ b/inference-engine/thirdparty/clDNN/src/network.cpp @@ -275,7 +275,8 @@ void network::set_arguments() { return; for (auto const& prim : _exec_order) { - prim->set_arguments(); + if (!prim->has_mutable_tensors()) + prim->set_arguments(); } _reset_arguments = false; } @@ -325,22 +326,26 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) { std::vector<std::shared_ptr<primitive_inst>
> chain; std::stack> candidates; auto& eng = get_engine(); - const auto& mem_orig = p_inst->output_memory(); + const auto mem_orig = p_inst->output_memory_ptr(); auto add_mdata_chain = [&](std::shared_ptr& p_inst) { auto mdata_ptr = std::dynamic_pointer_cast(p_inst); - if (!mdata_ptr) + if (!mdata_ptr || !mem_orig) return; // special handling for mutable data, which can share // its attached memory with both its inputs and outputs for (auto& dep : p_inst->dependencies()) { + if (dep->has_mutable_tensors()) + continue; // check dependencies - if (eng.is_the_same_buffer(mem_orig, dep->output_memory())) { + if (eng.is_the_same_buffer(*mem_orig, dep->output_memory())) { chain.push_back(std::const_pointer_cast(dep)); } // then second order dependencies for (auto& second_dep : dep->dependencies()) { - if (eng.is_the_same_buffer(mem_orig, second_dep->output_memory())) { + if (second_dep->has_mutable_tensors()) + continue; + if (eng.is_the_same_buffer(*mem_orig, second_dep->output_memory())) { chain.push_back(std::const_pointer_cast(second_dep)); } } @@ -350,7 +355,7 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) {
get_users(); for (const auto& usr : users) { auto usr_prim = get_primitive(usr->id()); - if (eng.is_the_same_buffer(mem_orig, usr_prim->output_memory())) { + if (eng.is_the_same_buffer(*mem_orig, usr_prim->output_memory())) { chain.push_back(usr_prim); } } @@ -364,11 +369,11 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) {
output_memory(); - if (eng.is_the_same_buffer(mem_orig, mem_cand)) { + if (eng.is_the_same_buffer(*mem_orig, mem_cand)) { auto nc_cand = std::const_pointer_cast(cand); chain.push_back(nc_cand); add_mdata_chain(nc_cand); @@ -379,7 +384,7 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
<primitive_inst>& p_inst) {
output_memory(); - if (eng.is_the_same_buffer(mem_orig, mem_dep)) { + if (eng.is_the_same_buffer(*mem_orig, mem_dep)) { auto nc_dep = std::const_pointer_cast(dep); chain.push_back(nc_dep); add_mdata_chain(nc_dep); @@ -554,7 +559,7 @@ void network::execute_impl(const std::vector& events) { // If a node has mutable input or it's an output, then the input/output buffers might be changed // So we need to set arguments on each execution. - if (inst->has_mutable_input() || inst->is_output()) { + if (inst->has_mutable_tensors() || inst->is_output()) { inst->set_arguments(); } execute_primitive(inst, events); @@ -702,12 +707,6 @@ void network::allocate_primitive_instance(program_node const& node) { return; auto inst = node.type()->create_instance(*this, node); - for (auto& dep : node.get_dependencies()) { - if (dep->is_type() || dep->is_type() || dep->can_be_optimized()) { - inst->set_mutable_input(true); - break; - } - } _primitives[node.id()] = inst; if (node.is_input()) diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index 3caf87274074a4..a47b0ed7e69409 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -142,6 +142,16 @@ void program::init_kernels() { for (auto& n : get_processing_order()) { if (n->get_selected_impl()) n->get_selected_impl()->init_kernels(); + + if (n->is_type() || n->can_be_optimized()) + n->set_mutable_tensors(true); + + for (auto& dep : n->get_dependencies()) { + if (dep->is_type() || dep->is_type() || dep->can_be_optimized()) { + n->set_mutable_tensors(true); + break; + } + } } } @@ -711,8 +721,10 @@ void program::reverse_connection(program_node& dep_node, program_node& user_node program_node& program::get_or_create(std::shared_ptr prim) { auto itr = nodes_map.lower_bound(prim->id); - if (itr != nodes_map.end() && itr->first == prim->id) + if (itr != nodes_map.end() && itr->first == prim->id) { + std::cerr << "get_or_create: get!\n"; return *itr->second; + } auto new_node = prim->type->create_node(*this, prim); nodes_map.insert(itr, {prim->id, new_node}); diff --git a/inference-engine/thirdparty/clDNN/src/reorder.cpp b/inference-engine/thirdparty/clDNN/src/reorder.cpp index 8985fc2f07325f..8d0cc048b8519e 100644 --- a/inference-engine/thirdparty/clDNN/src/reorder.cpp +++ b/inference-engine/thirdparty/clDNN/src/reorder.cpp @@ -188,7 +188,7 @@ std::string reorder_inst::to_string(reorder_node const& node) { reorder_inst::typed_primitive_inst(network& network, reorder_node const& node) : parent(network, node, !node.can_be_optimized()) { - if (node.can_be_optimized()) + if (node.can_be_optimized() && !node.has_mutable_tensors()) reuse_input(); auto input_layout = node.input().get_output_layout(); diff --git a/inference-engine/thirdparty/clDNN/src/reshape.cpp b/inference-engine/thirdparty/clDNN/src/reshape.cpp index b6de6de3b88fe7..673dbb7eaef243 100644 --- a/inference-engine/thirdparty/clDNN/src/reshape.cpp +++ b/inference-engine/thirdparty/clDNN/src/reshape.cpp @@ -83,7 +83,7 @@ reshape_inst::typed_primitive_inst(network& network, reshape_node const& node) : // then create new memory object as the reinterpreted output of the previous primitive if (!node.can_be_optimized()) _output = allocate_output(); - else + else if (!node.has_mutable_tensors()) reuse_input(); } diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp index 
861cd2672cd4e9..0e0937370c59bf 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp @@ -86,7 +86,7 @@ TEST(memory_pool, basic_non_padded_relu_pipe) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 64); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 48); } TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) { @@ -118,7 +118,7 @@ TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)896); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)640); } TEST(memory_pool, multi_outputs_network) { @@ -153,7 +153,7 @@ TEST(memory_pool, multi_outputs_network) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1536); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1280); } TEST(memory_pool, oooq) { @@ -191,7 +191,7 @@ TEST(memory_pool, oooq) { network.set_input_data("input", input); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2304); } TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) { @@ -396,14 +396,14 @@ TEST(memory_pool, shared_mem_pool_diff_batches) { auto outputs = network_first.execute(); auto dev_info = engine->get_device_info(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)2392); topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1 network network_second(*engine, topo, bo); network_second.set_input_data("input", input_1); auto outputs_second = network_second.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)2808); } TEST(memory_pool, shared_dep_two_output) { @@ -449,7 +449,7 @@ TEST(memory_pool, shared_dep_two_output) { network network(*engine, topo, bo); auto outputs = network.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)256); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)192); } TEST(memory_pool, non_opt_intermidate_opt_after) {