Skip to content

Commit

Permalink
[IE CLDNN] Add optional memory access type (openvinotoolkit#7)
Browse files Browse the repository at this point in the history
Co-authored-by: Sergey Shlyapnikov <[email protected]>
  • Loading branch information
2 people authored and Lyamin-Roman committed Sep 24, 2021
1 parent 0efc1a0 commit 0405656
Show file tree
Hide file tree
Showing 17 changed files with 99 additions and 80 deletions.
14 changes: 10 additions & 4 deletions inference-engine/thirdparty/clDNN/api/cldnn/runtime/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,19 @@ namespace cldnn {
class engine;
class stream;

// Access intent for memory::lock(). The OCL runtime translates these values
// into the matching OpenCL CL_MAP_* map flags (see get_cl_map_type in
// ocl_memory.cpp); backends that do not distinguish access modes may ignore
// the hint (e.g. gpu_usm::lock takes it but does not use it).
enum class mem_lock_type : int32_t {
read,       // map for host reads only
write,      // map for host writes only
read_write  // map for both reads and writes — the default, matching the pre-existing lock() behavior
};

struct memory {
using ptr = std::shared_ptr<memory>;
using cptr = std::shared_ptr<const memory>;
memory(engine* engine, const layout& layout, allocation_type type, bool reused = false);

virtual ~memory();
virtual void* lock(const stream& stream) = 0;
virtual void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) = 0;
virtual void unlock(const stream& stream) = 0;
virtual event::ptr fill(stream& stream, unsigned char pattern) = 0;
virtual event::ptr fill(stream& stream) = 0;
Expand Down Expand Up @@ -83,7 +89,7 @@ struct simple_attached_memory : memory {
simple_attached_memory(const layout& layout, void* pointer)
: memory(nullptr, layout, allocation_type::unknown), _pointer(pointer) {}

void* lock(const stream& /* stream */) override { return _pointer; }
void* lock(const stream& /* stream */, mem_lock_type /* type */) override { return _pointer; }
void unlock(const stream& /* stream */) override {}
event::ptr fill(stream& /* stream */, unsigned char) override { return nullptr; }
event::ptr fill(stream& /* stream */) override { return nullptr; }
Expand All @@ -102,9 +108,9 @@ struct simple_attached_memory : memory {
void* _pointer;
};

template <class T>
template <class T, mem_lock_type lock_type = mem_lock_type::read_write>
struct mem_lock {
explicit mem_lock(memory::ptr mem, const stream& stream) : _mem(mem), _stream(stream), _ptr(reinterpret_cast<T*>(_mem->lock(_stream))) {}
explicit mem_lock(memory::ptr mem, const stream& stream) : _mem(mem), _stream(stream), _ptr(reinterpret_cast<T*>(_mem->lock(_stream, lock_type))) {}

~mem_lock() {
_ptr = nullptr;
Expand Down
23 changes: 18 additions & 5 deletions inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,19 @@
namespace cldnn {
namespace ocl {

// Translate an engine-level lock intent into the OpenCL map-flag bits that
// clEnqueueMapBuffer / clEnqueueMapImage expect.
// Throws std::runtime_error if the value is not a known mem_lock_type.
static int get_cl_map_type(mem_lock_type type) {
    if (type == mem_lock_type::read)
        return CL_MAP_READ;
    if (type == mem_lock_type::write)
        return CL_MAP_WRITE;
    if (type == mem_lock_type::read_write)
        return CL_MAP_READ | CL_MAP_WRITE;
    throw std::runtime_error("Unsupported lock type for cl_memory buffer\n");
}

gpu_buffer::gpu_buffer(ocl_engine* engine,
const layout& layout)
: lockable_gpu_mem(), memory(engine, layout, allocation_type::cl_mem, false)
Expand All @@ -29,11 +42,11 @@ gpu_buffer::gpu_buffer(ocl_engine* engine,
: lockable_gpu_mem(), memory(engine, new_layout, allocation_type::cl_mem, true)
, _buffer(buffer) {}

void* gpu_buffer::lock(const stream& stream) {
void* gpu_buffer::lock(const stream& stream, mem_lock_type type) {
auto& cl_stream = downcast<const ocl_stream>(stream);
std::lock_guard<std::mutex> locker(_mutex);
if (0 == _lock_count) {
_mapped_ptr = cl_stream.get_cl_queue().enqueueMapBuffer(_buffer, CL_TRUE, CL_MAP_WRITE, 0, size());
_mapped_ptr = cl_stream.get_cl_queue().enqueueMapBuffer(_buffer, CL_TRUE, get_cl_map_type(type), 0, size());
}
_lock_count++;
return _mapped_ptr;
Expand Down Expand Up @@ -182,14 +195,14 @@ event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) {
return ev;
}

void* gpu_image2d::lock(const stream& stream) {
void* gpu_image2d::lock(const stream& stream, mem_lock_type type) {
auto& cl_stream = downcast<const ocl_stream>(stream);
std::lock_guard<std::mutex> locker(_mutex);
if (0 == _lock_count) {
_mapped_ptr = cl_stream.get_cl_queue()
.enqueueMapImage(_buffer,
CL_TRUE,
CL_MAP_WRITE,
get_cl_map_type(type),
{0, 0, 0},
{_width, _height, 1},
&_row_pitch,
Expand Down Expand Up @@ -286,7 +299,7 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
}
}

void* gpu_usm::lock(const stream& stream) {
void* gpu_usm::lock(const stream& stream, mem_lock_type /*type*/) {
assert(get_allocation_type() != allocation_type::usm_device && "Can't lock usm device memory!");
std::lock_guard<std::mutex> locker(_mutex);
if (0 == _lock_count) {
Expand Down
6 changes: 3 additions & 3 deletions inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ struct gpu_buffer : public lockable_gpu_mem, public memory {
gpu_buffer(ocl_engine* engine, const layout& new_layout, const cl::Buffer& buffer);
gpu_buffer(ocl_engine* engine, const layout& layout);

void* lock(const stream& stream) override;
void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;
void unlock(const stream& stream) override;
event::ptr fill(stream& stream, unsigned char pattern) override;
event::ptr fill(stream& stream) override;
Expand All @@ -54,7 +54,7 @@ struct gpu_image2d : public lockable_gpu_mem, public memory {
gpu_image2d(ocl_engine* engine, const layout& new_layout, const cl::Image2D& buffer);
gpu_image2d(ocl_engine* engine, const layout& layout);

void* lock(const stream& stream) override;
void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;
void unlock(const stream& stream) override;
event::ptr fill(stream& stream, unsigned char pattern) override;
event::ptr fill(stream& stream) override;
Expand Down Expand Up @@ -102,7 +102,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer, allocation_type type);
gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type);

void* lock(const stream& stream) override;
void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;
void unlock(const stream& stream) override;
const cl::UsmMemory& get_buffer() const { return _buffer; }
cl::UsmMemory& get_buffer() { return _buffer; }
Expand Down
4 changes: 2 additions & 2 deletions inference-engine/thirdparty/clDNN/src/data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ memory::ptr attach_or_copy_data(network& network, memory::ptr mem) {
return mem;

memory::ptr result = engine.allocate_memory(mem->get_layout(), false);
mem_lock<char> src(mem, network.get_stream());
mem_lock<char> dst(result, network.get_stream());
mem_lock<char, mem_lock_type::read> src(mem, network.get_stream());
mem_lock<char, mem_lock_type::write> dst(result, network.get_stream());
std::copy(src.begin(), src.end(), dst.begin());
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ void shuffle_weights(data_node& node, const std::vector<shuffle_range>& ranges,
auto new_weights_memory = old_weights_memory->get_engine()->allocate_memory(wei_layout, old_weights_memory->get_allocation_type(), need_reset);

auto bytes_per_elem = data_type_traits::size_of(wei_layout.data_type);
mem_lock<uint8_t> old_weights_memory_lock{old_weights_memory, stream};
mem_lock<uint8_t> new_weights_memory_lock{new_weights_memory, stream};
mem_lock<uint8_t, mem_lock_type::read> old_weights_memory_lock{old_weights_memory, stream};
mem_lock<uint8_t, mem_lock_type::write> new_weights_memory_lock{new_weights_memory, stream};
auto old_ptr = old_weights_memory_lock.data();
auto new_ptr = new_weights_memory_lock.data();
for (int32_t ofi = 0; ofi < wei_layout.size.batch[0]; ++ofi) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,11 @@ void pre_replace_deconv::run(program& p) {
std::vector<float> weights_vec_float;

if (weights_data_type == data_types::f16) {
mem_lock<half_t> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<half_t, mem_lock_type::read> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
for (uint32_t i = 0; i < weights_layout.size.count(); i++)
weights_vec_float.push_back(static_cast<float>(src.data()[i]));
} else {
mem_lock<float> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<float, mem_lock_type::read> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
for (uint32_t i = 0; i < weights_layout.size.count(); i++)
weights_vec_float.push_back(src.data()[i]);
}
Expand All @@ -240,10 +240,10 @@ void pre_replace_deconv::run(program& p) {
subpixel_weights);

if (weights_data_type == data_types::f16) {
mem_lock<half_t> dst{ data_to_allocate, stream};
mem_lock<half_t, mem_lock_type::write> dst{ data_to_allocate, stream};
program_helpers::set_weights_values<half_t>(dst.data(), subpixel_weights);
} else if (weights_data_type == data_types::f32) {
mem_lock<float> dst{ data_to_allocate, stream };
mem_lock<float, mem_lock_type::write> dst{ data_to_allocate, stream };
program_helpers::set_weights_values<float>(dst.data(), subpixel_weights);
} else {
throw std::logic_error("Not supported data type.");
Expand Down Expand Up @@ -283,10 +283,10 @@ void pre_replace_deconv::run(program& p) {
float bias = 0;

if (bias_data_type == data_types::f16) {
mem_lock<half_t> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<half_t, mem_lock_type::read> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
bias = static_cast<float>(src.data()[0]);
} else {
mem_lock<float> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<float, mem_lock_type::read> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
bias = src.data()[0];
}
auto pixel_shuffle_prim = std::make_shared<depth_to_space>(deconv_node_id, deconv_id_conv, 2, depth_to_space_mode::blocks_first);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ template<typename T>
bool check_binarization(memory::ptr mem_input_low, memory::ptr mem_input_high, program& p) {
bool is_binarization = true;
const auto& stream = p.get_stream();
mem_lock<T> data_input_low_lock{mem_input_low, stream};
mem_lock<T> data_input_high_lock{mem_input_high, stream};
mem_lock<T, mem_lock_type::read> data_input_low_lock{mem_input_low, stream};
mem_lock<T, mem_lock_type::read> data_input_high_lock{mem_input_high, stream};
auto data_input_low = data_input_low_lock.data();
auto data_input_high = data_input_high_lock.data();
const size_t number_mem_layout_elements = mem_input_high->get_layout().count();
Expand Down Expand Up @@ -85,26 +85,26 @@ void prepare_quantization::prepare_scale_shift_opt(program &p, quantize_node& q
std::function<float(size_t)>& get_data) {
switch (memory->get_layout().data_type) {
case data_types::f32: {
std::shared_ptr<mem_lock<float>> data_lock_ptr = std::make_shared<mem_lock<float>>(memory, stream);
std::shared_ptr<mem_lock<float, mem_lock_type::write>> data_lock_ptr = std::make_shared<mem_lock<float, mem_lock_type::write>>(memory, stream);
float* data = data_lock_ptr->data();
set_data = [data] (size_t idx, float value) {
data[idx] = value;
};
get_data = [data] (size_t idx) {
return data[idx];
};
return std::pair<std::shared_ptr<mem_lock<float>>, std::shared_ptr<mem_lock<uint16_t>>>(data_lock_ptr, nullptr);
return std::pair<std::shared_ptr<mem_lock<float, mem_lock_type::write>>, std::shared_ptr<mem_lock<uint16_t, mem_lock_type::write>>>(data_lock_ptr, nullptr);
}
case data_types::f16: {
std::shared_ptr<mem_lock<uint16_t>> data_lock_ptr = std::make_shared<mem_lock<uint16_t>>(memory, stream);
std::shared_ptr<mem_lock<uint16_t, mem_lock_type::write>> data_lock_ptr = std::make_shared<mem_lock<uint16_t, mem_lock_type::write>>(memory, stream);
uint16_t* data = data_lock_ptr->data();
set_data = [data] (size_t idx, float value) {
data[idx] = float_to_half(value);
};
get_data = [data] (size_t idx) {
return half_to_float(data[idx]);
};
return std::pair<std::shared_ptr<mem_lock<float>>, std::shared_ptr<mem_lock<uint16_t>>>(nullptr, data_lock_ptr);
return std::pair<std::shared_ptr<mem_lock<float, mem_lock_type::write>>, std::shared_ptr<mem_lock<uint16_t, mem_lock_type::write>>>(nullptr, data_lock_ptr);
}
default:
throw std::runtime_error("prepare_quantization: Unsupported precision of quantize output values");
Expand Down Expand Up @@ -378,8 +378,8 @@ void prepare_quantization::prepare_dequantize_merge(program& p, eltwise_node& el
auto mem0 = get_scale_shift_mem(eltwise_dep, i);
auto mem1 = get_scale_shift_mem(eltwise_node, i);

mem_lock<uint8_t> mem0_lock{mem0, stream};
mem_lock<uint8_t> mem1_lock{mem1, stream};
mem_lock<uint8_t, mem_lock_type::read> mem0_lock{mem0, stream};
mem_lock<uint8_t, mem_lock_type::read> mem1_lock{mem1, stream};
auto ptr0 = mem0_lock.data();
auto ptr1 = mem1_lock.data();

Expand Down Expand Up @@ -487,40 +487,40 @@ void prepare_quantization::prepare_asymmetric_quantization(program &p, convoluti
const auto& w_dt = wl.data_type;
const auto& azp_dt = azp->get_layout().data_type;

mem_lock<float> comp_lock{compensation, stream};
mem_lock<float, mem_lock_type::write> comp_lock{compensation, stream};

if (w_dt == data_types::u8 && azp_dt == data_types::u8) {
mem_lock<uint8_t> w_lock(w, stream);
mem_lock<uint8_t> azp_lock(azp, stream);
mem_lock<uint8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<uint8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<uint8_t> wzp_lock(wzp, stream);
mem_lock<uint8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<uint8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
}
} else if (w_dt == data_types::i8 && azp_dt == data_types::u8) {
mem_lock<int8_t> w_lock(w, stream);
mem_lock<uint8_t> azp_lock(azp, stream);
mem_lock<int8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<uint8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<int8_t> wzp_lock(wzp, stream);
mem_lock<int8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<int8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
}
} else if (w_dt == data_types::i8 && azp_dt == data_types::i8) {
mem_lock<int8_t> w_lock(w, stream);
mem_lock<int8_t> azp_lock(azp, stream);
mem_lock<int8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<int8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<int8_t> wzp_lock(wzp, stream);
mem_lock<int8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<int8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
}
} else if (w_dt == data_types::u8 && azp_dt == data_types::i8) {
mem_lock<uint8_t> w_lock(w, stream);
mem_lock<int8_t> azp_lock(azp, stream);
mem_lock<uint8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<int8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<uint8_t> wzp_lock(wzp, stream);
mem_lock<uint8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<uint8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
Expand Down Expand Up @@ -579,8 +579,8 @@ void prepare_quantization::prepare_asymmetric_quantization(program &p, convoluti
int s = new_a_zp->get_output_layout().size.feature[0];
auto azp_aligned = p.get_engine().allocate_memory(l);
auto old_ptr = new_a_zp->as<data>().get_attached_memory_ptr();
mem_lock<int8_t> new_data{azp_aligned, stream};
mem_lock<int8_t> old_data{old_ptr, stream};
mem_lock<int8_t, mem_lock_type::write> new_data{azp_aligned, stream};
mem_lock<int8_t, mem_lock_type::read> old_data{old_ptr, stream};
for (int i = 0; i < ifm_aligned; i++) {
new_data.data()[i] = old_data.data()[i % s];
}
Expand All @@ -602,8 +602,8 @@ void prepare_quantization::prepare_asymmetric_quantization(program &p, convoluti
int s = new_w_zp->get_output_layout().size.batch[0];
auto wzp_aligned = p.get_engine().allocate_memory(l);
auto old_ptr = new_w_zp->as<data>().get_attached_memory_ptr();
mem_lock<int8_t> new_data{wzp_aligned, stream};
mem_lock<int8_t> old_data{old_ptr, stream};
mem_lock<int8_t, mem_lock_type::write> new_data{wzp_aligned, stream};
mem_lock<int8_t, mem_lock_type::read> old_data{old_ptr, stream};
for (int i = 0; i < ofm_aligned; i++) {
new_data.data()[i] = old_data.data()[i % s];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ struct condition_impl : typed_primitive_impl<condition> {
else
memory_to_copy = execute_branch(instance.get_net_false(), instance.result_id(), instance.input_memory_ptr());
// just copy memory
mem_lock<float> inp_ptr{memory_to_copy, instance.get_network().get_stream()};
mem_lock<float> out_ptr{instance.output_memory_ptr(), instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::read> inp_ptr{memory_to_copy, instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::write> out_ptr{instance.output_memory_ptr(), instance.get_network().get_stream()};
std::copy(inp_ptr.begin(), inp_ptr.end(), out_ptr.begin());
ev->set();
return ev;
Expand Down Expand Up @@ -71,11 +71,11 @@ struct condition_impl : typed_primitive_impl<condition> {
Returns boolean flag, which says what branch should be executed.
*/
bool choose_branch_to_exec(condition_inst& instance) const {
mem_lock<float> lock_compare_data{instance.compare_memory_ptr(), instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::read> lock_compare_data{instance.compare_memory_ptr(), instance.get_network().get_stream()};
auto compare_layout = instance.compare_memory().get_layout();
auto compare_ptr = lock_compare_data.begin();

mem_lock<float> lock_input{instance.input_memory_ptr(), instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::read> lock_input{instance.input_memory_ptr(), instance.get_network().get_stream()};
auto input_layout = instance.input_memory().get_layout();
auto input_ptr = lock_input.begin();

Expand Down
Loading

0 comments on commit 0405656

Please sign in to comment.