[GPU] Disabled default mem allocation for input_layout
vladimir-paramuzov committed Aug 6, 2021
1 parent 371fc7a commit a3e7466
Showing 11 changed files with 59 additions and 41 deletions.
3 changes: 3 additions & 0 deletions inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp
@@ -362,6 +362,9 @@ shared_mem_params gpu_usm::get_internal_params() const {
 std::vector<cl_mem> ocl_surfaces_lock::get_handles(std::vector<memory::ptr> mem) const {
     std::vector<cl_mem> res;
     for (auto& m : mem) {
+        if (!m)
+            continue;
+
         auto mem_type = m->get_internal_params().mem_type;
         if (mem_type == shared_mem_type::shared_mem_vasurface || mem_type == shared_mem_type::shared_mem_dxbuffer) {
             res.push_back(static_cast<cl_mem>(m->get_internal_params().mem));
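The guard above is what keeps surface locking safe now that an input_layout's output may be a null memory::ptr until the user binds data. A minimal self-contained sketch of the skip-null pattern (mem_buf and collect_handles are hypothetical stand-ins, not the cldnn API):

    #include <iostream>
    #include <memory>
    #include <vector>

    struct mem_buf { int native_handle = 0; };  // stand-in for cldnn::memory

    // Collect native handles while tolerating slots that were never allocated.
    std::vector<int> collect_handles(const std::vector<std::shared_ptr<mem_buf>>& mems) {
        std::vector<int> res;
        for (const auto& m : mems) {
            if (!m)
                continue;  // unbound input_layout output: nothing to lock, skip it
            res.push_back(m->native_handle);
        }
        return res;
    }

    int main() {
        std::vector<std::shared_ptr<mem_buf>> mems = {
            std::make_shared<mem_buf>(mem_buf{42}), nullptr};  // one bound, one not
        std::cout << collect_handles(mems).size() << "\n";     // prints 1
    }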
5 changes: 2 additions & 3 deletions inference-engine/thirdparty/clDNN/src/crop.cpp
@@ -129,10 +129,8 @@ crop_inst::typed_primitive_inst(network& network, crop_node const& node) : paren
                           ref_in_sizes,
                           "Invalid Batch offset: exceeds data for output!");

-    if (node.can_be_optimized()) {
-        build_deps();
+    if (node.can_be_optimized() && !node.has_mutable_tensors())
         reuse_input();
-    }
 }

 void crop_inst::on_execute() {
@@ -146,6 +144,7 @@ void crop_inst::on_execute() {
 }

 void crop_inst::reuse_input() {
+    build_deps();
     _output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout());
 }
 } // namespace cldnn
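Note that build_deps() moved from the constructor into reuse_input(): dependencies are now resolved only on the reuse path, and reuse is skipped entirely when the node touches mutable tensors. A tiny self-contained sketch of that deferral (crop_like and buffer are hypothetical stand-ins):

    #include <cassert>
    #include <memory>

    struct buffer {};

    class crop_like {
        std::shared_ptr<buffer> _output;
        bool _deps_built = false;

        void build_deps() { _deps_built = true; }  // resolve dependency instances on demand

    public:
        // Alias the input buffer instead of allocating a fresh one.
        void reuse_input(std::shared_ptr<buffer> input) {
            build_deps();               // deferred: only needed on the reuse path
            _output = std::move(input);
        }
        bool deps_built() const { return _deps_built; }
    };

    int main() {
        crop_like c;
        assert(!c.deps_built());        // construction no longer builds deps
        c.reuse_input(std::make_shared<buffer>());
        assert(c.deps_built());
    }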
18 changes: 10 additions & 8 deletions inference-engine/thirdparty/clDNN/src/include/primitive_inst.h
@@ -6,6 +6,7 @@
 #pragma once
 #include "cldnn/primitives/primitive.hpp"
 #include "cldnn/primitives/concatenation.hpp"
+#include "cldnn/primitives/input_layout.hpp"
 #include "cldnn/runtime/error_handler.hpp"
 #include "cldnn/runtime/event.hpp"
 #include "cldnn/runtime/memory.hpp"
@@ -77,7 +78,11 @@ class primitive_inst {

     memory& dep_memory(size_t index) const { return dependencies().at(index)->output_memory(); }
     memory::ptr dep_memory_ptr(size_t index) const { return dependencies().at(index)->output_memory_ptr(); }
-    memory& output_memory() const { return *_output; }
+    memory& output_memory() const {
+        if (!_output)
+            throw std::runtime_error("[GPU] output is not allocated");
+        return *_output;
+    }
     memory::ptr output_memory_ptr() const { return _output; }
     size_t inputs_memory_count() const { return _node.get_primitive()->input_size(); }
     primitive_type_id type() const { return _node.type(); }
@@ -128,12 +133,8 @@ class primitive_inst {
     size_t get_fused_mem_count() const { return _node.get_fused_inputs_count(); }
     size_t get_fused_mem_offset() const { return _node.get_fused_primitives()[0].dep_start_idx; }

-    bool has_mutable_input() const {
-        return _has_mutable_input;
-    }
-
-    void set_mutable_input(bool val) {
-        _has_mutable_input = val;
+    bool has_mutable_tensors() const {
+        return _node.has_mutable_tensors();
     }

     bool is_output() const {
@@ -169,7 +170,6 @@ class primitive_inst {
     bool _output_changed; // todo: implement output reuse if neither of inputs has changed
     bool _has_valid_input =
         true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst)
-    bool _has_mutable_input = false;

     memory::ptr allocate_output();
     static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
@@ -260,6 +260,8 @@ class typed_primitive_inst_base : public primitive_inst {
             typ_node.get_users().front()->can_be_optimized()) { // check if the only user is concat
             return false;
         }
+        if (typ_node.template is_type<input_layout>())
+            return false;
         return true;
     }
 };
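With _output now allowed to stay null, the reference-returning accessor fails loudly instead of dereferencing a null pointer, while output_memory_ptr() lets callers probe first. A self-contained sketch of this checked-accessor pattern (prim_inst and mem_buf are stand-ins, not the real classes):

    #include <iostream>
    #include <memory>
    #include <stdexcept>

    struct mem_buf { size_t size = 0; };  // stand-in for cldnn::memory

    class prim_inst {
        std::shared_ptr<mem_buf> _output;  // may stay null until data is bound
    public:
        // Checked accessor: throw instead of dereferencing a null pointer.
        mem_buf& output_memory() const {
            if (!_output)
                throw std::runtime_error("[GPU] output is not allocated");
            return *_output;
        }
        // Nullable accessor for callers that can cope with a missing buffer.
        std::shared_ptr<mem_buf> output_memory_ptr() const { return _output; }

        void bind(std::shared_ptr<mem_buf> m) { _output = std::move(m); }
    };

    int main() {
        prim_inst inst;
        if (!inst.output_memory_ptr())
            std::cout << "no buffer yet - skip\n";       // what add_output_chain now does
        inst.bind(std::make_shared<mem_buf>(mem_buf{64}));
        std::cout << inst.output_memory().size << "\n";  // safe after binding
    }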
4 changes: 4 additions & 0 deletions inference-engine/thirdparty/clDNN/src/include/program_node.h
@@ -220,6 +220,9 @@ struct program_node {
     bool can_be_optimized() const { return optimized; }
     void can_be_optimized(bool opt) { optimized = opt; }

+    bool has_mutable_tensors() const { return _has_mutable_tensors; }
+    void set_mutable_tensors(bool val) { _has_mutable_tensors = val; }
+
     // check/set if the node's buffer can be shared during the memory pool optimization
     bool can_share_buffer() const { return share_buffer; }
     void can_share_buffer(bool share) { share_buffer = share; }
@@ -337,6 +340,7 @@ struct program_node {
     uint8_t user_mark = 0;
     bool optimized = false;
     bool share_buffer = true;
+    bool _has_mutable_tensors = false;
     std::array<bool, tensor_dim_max> _support_padding_in_axis;

     mutable bool has_reused_memory = false;
2 changes: 2 additions & 0 deletions inference-engine/thirdparty/clDNN/src/input_layout.cpp
@@ -35,6 +35,8 @@ void input_layout_inst::set_data(memory::ptr mem) {
     if (mem->is_allocated_by(get_network().get_engine())) {
         _output = mem;
     } else {
+        if (!_output)
+            _output = get_network().get_engine().allocate_memory(mem->get_layout(), false);
         mem_lock<char> src(mem, get_network().get_stream());
         mem_lock<char> dst(_output, get_network().get_stream());
         std::copy(src.begin(), src.end(), dst.begin());
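This is the heart of the commit: the input's backing buffer is created lazily on the first set_data() call, and only when the user's memory cannot be adopted as-is. A self-contained sketch of the lazy-allocation pattern under those assumptions (input_slot and mem_buf are hypothetical stand-ins for input_layout_inst and cldnn::memory):

    #include <algorithm>
    #include <cassert>
    #include <memory>
    #include <vector>

    struct mem_buf {
        std::vector<char> data;
        bool engine_owned = false;  // was this allocated by our engine?
    };

    class input_slot {
        std::shared_ptr<mem_buf> _output;  // starts null: no default allocation
    public:
        void set_data(const std::shared_ptr<mem_buf>& mem) {
            if (mem->engine_owned) {
                _output = mem;  // zero-copy: adopt the engine-allocated buffer
            } else {
                if (!_output)   // first call only: allocate the backing store
                    _output = std::make_shared<mem_buf>(
                        mem_buf{std::vector<char>(mem->data.size()), true});
                std::copy(mem->data.begin(), mem->data.end(), _output->data.begin());
            }
        }
        std::shared_ptr<mem_buf> output() const { return _output; }
    };

    int main() {
        input_slot slot;
        assert(!slot.output());  // nothing allocated until data arrives
        auto host = std::make_shared<mem_buf>(mem_buf{std::vector<char>(16), false});
        slot.set_data(host);     // triggers the lazy allocation + copy
        assert(slot.output() && slot.output()->data.size() == 16);
    }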
5 changes: 1 addition & 4 deletions inference-engine/thirdparty/clDNN/src/loop.cpp
@@ -349,7 +349,7 @@ void loop_inst::preprocess_input_memory() {
         bool is_concatenated_input = (input_map->axis >= 0);
         if (is_concatenated_input) {
             layout sliced_layout
-                = body_network->get_primitive(input_map->internal_id)->output_memory().get_layout();
+                = body_network->get_primitive(input_map->internal_id)->get_node().get_output_layout();
             const int64_t max_iteration = node.get_max_iteration();
             std::vector<memory::ptr> sliced_mems;
             sliced_mems.reserve(max_iteration);
@@ -367,9 +367,6 @@
             concatenated_input_mem_mapping_info.sliced_data_prim = body_network->get_primitive(input_map->internal_id);
             iteration_mem.push_back(concatenated_input_mem_mapping_info);
         } else {
-            if (memory->get_layout().data_type != body_network->get_primitive(input_map->internal_id)->output_memory().get_layout().data_type) {
-                CLDNN_ERROR_MESSAGE(id(), "incompatible datatypes");
-            }
             body_network->set_input_data(input_map->internal_id, memory);
         }
     }
31 changes: 15 additions & 16 deletions inference-engine/thirdparty/clDNN/src/network.cpp
@@ -275,7 +275,8 @@ void network::set_arguments() {
         return;

     for (auto const& prim : _exec_order) {
-        prim->set_arguments();
+        if (!prim->has_mutable_tensors())
+            prim->set_arguments();
     }
     _reset_arguments = false;
 }
@@ -325,22 +326,26 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr<p
     std::vector<std::shared_ptr<primitive_inst>> chain;
     std::stack<std::shared_ptr<const primitive_inst>> candidates;
     auto& eng = get_engine();
-    const auto& mem_orig = p_inst->output_memory();
+    const auto mem_orig = p_inst->output_memory_ptr();

     auto add_mdata_chain = [&](std::shared_ptr<primitive_inst>& p_inst) {
         auto mdata_ptr = std::dynamic_pointer_cast<mutable_data_inst>(p_inst);
-        if (!mdata_ptr)
+        if (!mdata_ptr || !mem_orig)
             return;
         // special handling for mutable data, which can share
         // its attached memory with both its inputs and outputs
         for (auto& dep : p_inst->dependencies()) {
+            if (dep->has_mutable_tensors())
+                continue;
             // check dependencies
-            if (eng.is_the_same_buffer(mem_orig, dep->output_memory())) {
+            if (eng.is_the_same_buffer(*mem_orig, dep->output_memory())) {
                 chain.push_back(std::const_pointer_cast<primitive_inst>(dep));
             }
             // then second order dependencies
             for (auto& second_dep : dep->dependencies()) {
-                if (eng.is_the_same_buffer(mem_orig, second_dep->output_memory())) {
+                if (second_dep->has_mutable_tensors())
+                    continue;
+                if (eng.is_the_same_buffer(*mem_orig, second_dep->output_memory())) {
                     chain.push_back(std::const_pointer_cast<primitive_inst>(second_dep));
                 }
             }
@@ -350,7 +355,7 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr<p
     const auto& users = p_inst->get_users();
     for (const auto& usr : users) {
         auto usr_prim = get_primitive(usr->id());
-        if (eng.is_the_same_buffer(mem_orig, usr_prim->output_memory())) {
+        if (eng.is_the_same_buffer(*mem_orig, usr_prim->output_memory())) {
             chain.push_back(usr_prim);
         }
     }
@@ -364,11 +369,11 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr<p
     add_mdata_chain(p_inst);

     // find all dependencies that are 'optimized'
-    while (!candidates.empty()) {
+    while (mem_orig && !candidates.empty()) {
         auto& cand = candidates.top();
         candidates.pop();
         const auto& mem_cand = cand->output_memory();
-        if (eng.is_the_same_buffer(mem_orig, mem_cand)) {
+        if (eng.is_the_same_buffer(*mem_orig, mem_cand)) {
             auto nc_cand = std::const_pointer_cast<primitive_inst>(cand);
             chain.push_back(nc_cand);
             add_mdata_chain(nc_cand);
@@ -379,7 +384,7 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr<p
                 candidates.push(dep);
             } else {
                 const auto& mem_dep = dep->output_memory();
-                if (eng.is_the_same_buffer(mem_orig, mem_dep)) {
+                if (eng.is_the_same_buffer(*mem_orig, mem_dep)) {
                     auto nc_dep = std::const_pointer_cast<primitive_inst>(dep);
                     chain.push_back(nc_dep);
                     add_mdata_chain(nc_dep);
@@ -554,7 +559,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {

         // If a node has mutable input or it's an output, then the input/output buffers might be changed
         // So we need to set arguments on each execution.
-        if (inst->has_mutable_input() || inst->is_output()) {
+        if (inst->has_mutable_tensors() || inst->is_output()) {
             inst->set_arguments();
         }
         execute_primitive(inst, events);
@@ -702,12 +707,6 @@ void network::allocate_primitive_instance(program_node const& node) {
         return;

     auto inst = node.type()->create_instance(*this, node);
-    for (auto& dep : node.get_dependencies()) {
-        if (dep->is_type<input_layout>() || dep->is_type<mutable_data>() || dep->can_be_optimized()) {
-            inst->set_mutable_input(true);
-            break;
-        }
-    }

     _primitives[node.id()] = inst;
     if (node.is_input())
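Argument binding is now split in two phases: primitives with stable buffers are bound once in network::set_arguments(), while anything flagged has_mutable_tensors() (or producing a network output) is rebound on every execution, since its buffer may appear or change between runs. A compact self-contained sketch of that split (prim, setup, and execute are hypothetical stand-ins):

    #include <iostream>
    #include <vector>

    struct prim {
        const char* id;
        bool mutable_tensors;   // input_layout / mutable_data / optimized deps
        bool is_output;
        void set_arguments() { std::cout << "bind args for " << id << "\n"; }
    };

    void setup(std::vector<prim>& exec_order) {
        for (auto& p : exec_order)
            if (!p.mutable_tensors)     // stable buffers: bind once up front
                p.set_arguments();
    }

    void execute(std::vector<prim>& exec_order) {
        for (auto& p : exec_order) {
            if (p.mutable_tensors || p.is_output)
                p.set_arguments();      // buffer may have (re)appeared or changed
            // ... enqueue the kernel here ...
        }
    }

    int main() {
        std::vector<prim> order = {{"input", true, false}, {"conv", false, false}, {"relu", false, true}};
        setup(order);    // binds only "conv"
        execute(order);  // rebinds "input" and "relu" each run
    }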
14 changes: 13 additions & 1 deletion inference-engine/thirdparty/clDNN/src/program.cpp
@@ -142,6 +142,16 @@ void program::init_kernels() {
     for (auto& n : get_processing_order()) {
         if (n->get_selected_impl())
             n->get_selected_impl()->init_kernels();
+
+        if (n->is_type<input_layout>() || n->can_be_optimized())
+            n->set_mutable_tensors(true);
+
+        for (auto& dep : n->get_dependencies()) {
+            if (dep->is_type<input_layout>() || dep->is_type<mutable_data>() || dep->can_be_optimized()) {
+                n->set_mutable_tensors(true);
+                break;
+            }
+        }
     }
 }

@@ -711,8 +721,10 @@ void program::reverse_connection(program_node& dep_node, program_node& user_node

 program_node& program::get_or_create(std::shared_ptr<primitive> prim) {
     auto itr = nodes_map.lower_bound(prim->id);
-    if (itr != nodes_map.end() && itr->first == prim->id)
+    if (itr != nodes_map.end() && itr->first == prim->id) {
+        std::cerr << "get_or_create: get!\n";
         return *itr->second;
+    }

     auto new_node = prim->type->create_node(*this, prim);
     nodes_map.insert(itr, {prim->id, new_node});
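The mutable-tensor flag is computed once per program node during init_kernels(): a node is flagged if it is itself an input_layout or optimized out, or if any direct dependency is an input_layout, mutable_data, or optimized node; this replaces the per-instance scan that network::allocate_primitive_instance used to do. A self-contained sketch of the marking pass (node is a hypothetical stand-in for program_node):

    #include <cassert>
    #include <vector>

    struct node {
        bool is_input_layout = false;
        bool is_mutable_data = false;
        bool optimized = false;
        bool has_mutable_tensors = false;
        std::vector<node*> deps;
    };

    void mark_mutable_tensors(std::vector<node*>& processing_order) {
        for (node* n : processing_order) {
            // The node's own buffer is unstable if it is an input or optimized out...
            if (n->is_input_layout || n->optimized)
                n->has_mutable_tensors = true;
            // ...or if any direct dependency's buffer can change between runs.
            for (node* dep : n->deps) {
                if (dep->is_input_layout || dep->is_mutable_data || dep->optimized) {
                    n->has_mutable_tensors = true;
                    break;
                }
            }
        }
    }

    int main() {
        node in, conv;
        in.is_input_layout = true;
        conv.deps = {&in};
        std::vector<node*> order = {&in, &conv};
        mark_mutable_tensors(order);
        assert(in.has_mutable_tensors && conv.has_mutable_tensors);
    }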
2 changes: 1 addition & 1 deletion inference-engine/thirdparty/clDNN/src/reorder.cpp
@@ -188,7 +188,7 @@ std::string reorder_inst::to_string(reorder_node const& node) {

 reorder_inst::typed_primitive_inst(network& network, reorder_node const& node)
     : parent(network, node, !node.can_be_optimized()) {
-    if (node.can_be_optimized())
+    if (node.can_be_optimized() && !node.has_mutable_tensors())
         reuse_input();

     auto input_layout = node.input().get_output_layout();
2 changes: 1 addition & 1 deletion inference-engine/thirdparty/clDNN/src/reshape.cpp
@@ -83,7 +83,7 @@ reshape_inst::typed_primitive_inst(network& network, reshape_node const& node) :
     // then create new memory object as the reinterpreted output of the previous primitive
     if (!node.can_be_optimized())
         _output = allocate_output();
-    else
+    else if (!node.has_mutable_tensors())
         reuse_input();
 }
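In the memory_pool test suite below, every expected get_max_used_device_memory() value decreases, consistent with input buffers no longer being allocated by default at network build time.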
@@ -86,7 +86,7 @@ TEST(memory_pool, basic_non_padded_relu_pipe) {
     network.set_input_data("input", input);
     auto outputs = network.execute();

-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 64);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 48);
 }

 TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
@@ -118,7 +118,7 @@ TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
     network.set_input_data("input", input);
     auto outputs = network.execute();

-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)896);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)640);
 }

 TEST(memory_pool, multi_outputs_network) {
@@ -153,7 +153,7 @@ TEST(memory_pool, multi_outputs_network) {
     network.set_input_data("input", input);
     auto outputs = network.execute();

-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1536);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)1280);
 }

 TEST(memory_pool, oooq) {
@@ -191,7 +191,7 @@ TEST(memory_pool, oooq) {
     network.set_input_data("input", input);
     auto outputs = network.execute();

-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2304);
 }

 TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
@@ -396,14 +396,14 @@ TEST(memory_pool, shared_mem_pool_diff_batches) {
     auto outputs = network_first.execute();

     auto dev_info = engine->get_device_info();
-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)2392);

     topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1

     network network_second(*engine, topo, bo);
     network_second.set_input_data("input", input_1);
     auto outputs_second = network_second.execute();
-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)2808);
 }

 TEST(memory_pool, shared_dep_two_output) {
@@ -449,7 +449,7 @@ TEST(memory_pool, shared_dep_two_output) {

     network network(*engine, topo, bo);
     auto outputs = network.execute();
-    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)256);
+    EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)192);
 }

 TEST(memory_pool, non_opt_intermidate_opt_after) {
