Skip to content

Commit

Permalink
[IE CLDNN] Add optional memory access type (openvinotoolkit#7)
Browse files Browse the repository at this point in the history
Co-authored-by: Sergey Shlyapnikov <[email protected]>
  • Loading branch information
2 people authored and Lyamin-Roman committed Sep 24, 2021
1 parent 0efc1a0 commit 0405656
Show file tree
Hide file tree
Showing 17 changed files with 99 additions and 80 deletions.
14 changes: 10 additions & 4 deletions inference-engine/thirdparty/clDNN/api/cldnn/runtime/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,19 @@ namespace cldnn {
class engine;
class stream;

// Access intent for memory::lock(). The OCL runtime translates these values
// into the matching OpenCL CL_MAP_* map flags (see get_cl_map_type in
// ocl_memory.cpp); backends that do not distinguish access modes may ignore
// the hint (e.g. gpu_usm::lock takes it but does not use it).
enum class mem_lock_type : int32_t {
read,       // map for host reads only
write,      // map for host writes only
read_write  // map for both reads and writes — the default, matching the pre-existing lock() behavior
};

struct memory {
using ptr = std::shared_ptr<memory>;
using cptr = std::shared_ptr<const memory>;
memory(engine* engine, const layout& layout, allocation_type type, bool reused = false);

virtual ~memory();
virtual void* lock(const stream& stream) = 0;
virtual void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) = 0;
virtual void unlock(const stream& stream) = 0;
virtual event::ptr fill(stream& stream, unsigned char pattern) = 0;
virtual event::ptr fill(stream& stream) = 0;
Expand Down Expand Up @@ -83,7 +89,7 @@ struct simple_attached_memory : memory {
simple_attached_memory(const layout& layout, void* pointer)
: memory(nullptr, layout, allocation_type::unknown), _pointer(pointer) {}

void* lock(const stream& /* stream */) override { return _pointer; }
void* lock(const stream& /* stream */, mem_lock_type /* type */) override { return _pointer; }
void unlock(const stream& /* stream */) override {}
event::ptr fill(stream& /* stream */, unsigned char) override { return nullptr; }
event::ptr fill(stream& /* stream */) override { return nullptr; }
Expand All @@ -102,9 +108,9 @@ struct simple_attached_memory : memory {
void* _pointer;
};

template <class T>
template <class T, mem_lock_type lock_type = mem_lock_type::read_write>
struct mem_lock {
explicit mem_lock(memory::ptr mem, const stream& stream) : _mem(mem), _stream(stream), _ptr(reinterpret_cast<T*>(_mem->lock(_stream))) {}
explicit mem_lock(memory::ptr mem, const stream& stream) : _mem(mem), _stream(stream), _ptr(reinterpret_cast<T*>(_mem->lock(_stream, lock_type))) {}

~mem_lock() {
_ptr = nullptr;
Expand Down
23 changes: 18 additions & 5 deletions inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,19 @@
namespace cldnn {
namespace ocl {

// Translate an engine-level lock intent into the OpenCL map-flag bits that
// clEnqueueMapBuffer / clEnqueueMapImage expect.
// Throws std::runtime_error if the value is not a known mem_lock_type.
static int get_cl_map_type(mem_lock_type type) {
    if (type == mem_lock_type::read)
        return CL_MAP_READ;
    if (type == mem_lock_type::write)
        return CL_MAP_WRITE;
    if (type == mem_lock_type::read_write)
        return CL_MAP_READ | CL_MAP_WRITE;
    throw std::runtime_error("Unsupported lock type for cl_memory buffer\n");
}

gpu_buffer::gpu_buffer(ocl_engine* engine,
const layout& layout)
: lockable_gpu_mem(), memory(engine, layout, allocation_type::cl_mem, false)
Expand All @@ -29,11 +42,11 @@ gpu_buffer::gpu_buffer(ocl_engine* engine,
: lockable_gpu_mem(), memory(engine, new_layout, allocation_type::cl_mem, true)
, _buffer(buffer) {}

void* gpu_buffer::lock(const stream& stream) {
void* gpu_buffer::lock(const stream& stream, mem_lock_type type) {
auto& cl_stream = downcast<const ocl_stream>(stream);
std::lock_guard<std::mutex> locker(_mutex);
if (0 == _lock_count) {
_mapped_ptr = cl_stream.get_cl_queue().enqueueMapBuffer(_buffer, CL_TRUE, CL_MAP_WRITE, 0, size());
_mapped_ptr = cl_stream.get_cl_queue().enqueueMapBuffer(_buffer, CL_TRUE, get_cl_map_type(type), 0, size());
}
_lock_count++;
return _mapped_ptr;
Expand Down Expand Up @@ -182,14 +195,14 @@ event::ptr gpu_image2d::fill(stream& stream, unsigned char pattern) {
return ev;
}

void* gpu_image2d::lock(const stream& stream) {
void* gpu_image2d::lock(const stream& stream, mem_lock_type type) {
auto& cl_stream = downcast<const ocl_stream>(stream);
std::lock_guard<std::mutex> locker(_mutex);
if (0 == _lock_count) {
_mapped_ptr = cl_stream.get_cl_queue()
.enqueueMapImage(_buffer,
CL_TRUE,
CL_MAP_WRITE,
get_cl_map_type(type),
{0, 0, 0},
{_width, _height, 1},
&_row_pitch,
Expand Down Expand Up @@ -286,7 +299,7 @@ gpu_usm::gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type)
}
}

void* gpu_usm::lock(const stream& stream) {
void* gpu_usm::lock(const stream& stream, mem_lock_type /*type*/) {
assert(get_allocation_type() != allocation_type::usm_device && "Can't lock usm device memory!");
std::lock_guard<std::mutex> locker(_mutex);
if (0 == _lock_count) {
Expand Down
6 changes: 3 additions & 3 deletions inference-engine/thirdparty/clDNN/runtime/ocl/ocl_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ struct gpu_buffer : public lockable_gpu_mem, public memory {
gpu_buffer(ocl_engine* engine, const layout& new_layout, const cl::Buffer& buffer);
gpu_buffer(ocl_engine* engine, const layout& layout);

void* lock(const stream& stream) override;
void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;
void unlock(const stream& stream) override;
event::ptr fill(stream& stream, unsigned char pattern) override;
event::ptr fill(stream& stream) override;
Expand All @@ -54,7 +54,7 @@ struct gpu_image2d : public lockable_gpu_mem, public memory {
gpu_image2d(ocl_engine* engine, const layout& new_layout, const cl::Image2D& buffer);
gpu_image2d(ocl_engine* engine, const layout& layout);

void* lock(const stream& stream) override;
void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;
void unlock(const stream& stream) override;
event::ptr fill(stream& stream, unsigned char pattern) override;
event::ptr fill(stream& stream) override;
Expand Down Expand Up @@ -102,7 +102,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
gpu_usm(ocl_engine* engine, const layout& new_layout, const cl::UsmMemory& usm_buffer, allocation_type type);
gpu_usm(ocl_engine* engine, const layout& layout, allocation_type type);

void* lock(const stream& stream) override;
void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override;
void unlock(const stream& stream) override;
const cl::UsmMemory& get_buffer() const { return _buffer; }
cl::UsmMemory& get_buffer() { return _buffer; }
Expand Down
4 changes: 2 additions & 2 deletions inference-engine/thirdparty/clDNN/src/data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ memory::ptr attach_or_copy_data(network& network, memory::ptr mem) {
return mem;

memory::ptr result = engine.allocate_memory(mem->get_layout(), false);
mem_lock<char> src(mem, network.get_stream());
mem_lock<char> dst(result, network.get_stream());
mem_lock<char, mem_lock_type::read> src(mem, network.get_stream());
mem_lock<char, mem_lock_type::write> dst(result, network.get_stream());
std::copy(src.begin(), src.end(), dst.begin());
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ void shuffle_weights(data_node& node, const std::vector<shuffle_range>& ranges,
auto new_weights_memory = old_weights_memory->get_engine()->allocate_memory(wei_layout, old_weights_memory->get_allocation_type(), need_reset);

auto bytes_per_elem = data_type_traits::size_of(wei_layout.data_type);
mem_lock<uint8_t> old_weights_memory_lock{old_weights_memory, stream};
mem_lock<uint8_t> new_weights_memory_lock{new_weights_memory, stream};
mem_lock<uint8_t, mem_lock_type::read> old_weights_memory_lock{old_weights_memory, stream};
mem_lock<uint8_t, mem_lock_type::write> new_weights_memory_lock{new_weights_memory, stream};
auto old_ptr = old_weights_memory_lock.data();
auto new_ptr = new_weights_memory_lock.data();
for (int32_t ofi = 0; ofi < wei_layout.size.batch[0]; ++ofi) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,11 @@ void pre_replace_deconv::run(program& p) {
std::vector<float> weights_vec_float;

if (weights_data_type == data_types::f16) {
mem_lock<half_t> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<half_t, mem_lock_type::read> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
for (uint32_t i = 0; i < weights_layout.size.count(); i++)
weights_vec_float.push_back(static_cast<float>(src.data()[i]));
} else {
mem_lock<float> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<float, mem_lock_type::read> src{ weights_node_ptr->as<data>().get_attached_memory_ptr(), stream };
for (uint32_t i = 0; i < weights_layout.size.count(); i++)
weights_vec_float.push_back(src.data()[i]);
}
Expand All @@ -240,10 +240,10 @@ void pre_replace_deconv::run(program& p) {
subpixel_weights);

if (weights_data_type == data_types::f16) {
mem_lock<half_t> dst{ data_to_allocate, stream};
mem_lock<half_t, mem_lock_type::write> dst{ data_to_allocate, stream};
program_helpers::set_weights_values<half_t>(dst.data(), subpixel_weights);
} else if (weights_data_type == data_types::f32) {
mem_lock<float> dst{ data_to_allocate, stream };
mem_lock<float, mem_lock_type::write> dst{ data_to_allocate, stream };
program_helpers::set_weights_values<float>(dst.data(), subpixel_weights);
} else {
throw std::logic_error("Not supported data type.");
Expand Down Expand Up @@ -283,10 +283,10 @@ void pre_replace_deconv::run(program& p) {
float bias = 0;

if (bias_data_type == data_types::f16) {
mem_lock<half_t> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<half_t, mem_lock_type::read> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
bias = static_cast<float>(src.data()[0]);
} else {
mem_lock<float> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
mem_lock<float, mem_lock_type::read> src{ bias_id_node_ptr->as<data>().get_attached_memory_ptr(), stream };
bias = src.data()[0];
}
auto pixel_shuffle_prim = std::make_shared<depth_to_space>(deconv_node_id, deconv_id_conv, 2, depth_to_space_mode::blocks_first);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ template<typename T>
bool check_binarization(memory::ptr mem_input_low, memory::ptr mem_input_high, program& p) {
bool is_binarization = true;
const auto& stream = p.get_stream();
mem_lock<T> data_input_low_lock{mem_input_low, stream};
mem_lock<T> data_input_high_lock{mem_input_high, stream};
mem_lock<T, mem_lock_type::read> data_input_low_lock{mem_input_low, stream};
mem_lock<T, mem_lock_type::read> data_input_high_lock{mem_input_high, stream};
auto data_input_low = data_input_low_lock.data();
auto data_input_high = data_input_high_lock.data();
const size_t number_mem_layout_elements = mem_input_high->get_layout().count();
Expand Down Expand Up @@ -85,26 +85,26 @@ void prepare_quantization::prepare_scale_shift_opt(program &p, quantize_node& q
std::function<float(size_t)>& get_data) {
switch (memory->get_layout().data_type) {
case data_types::f32: {
std::shared_ptr<mem_lock<float>> data_lock_ptr = std::make_shared<mem_lock<float>>(memory, stream);
std::shared_ptr<mem_lock<float, mem_lock_type::write>> data_lock_ptr = std::make_shared<mem_lock<float, mem_lock_type::write>>(memory, stream);
float* data = data_lock_ptr->data();
set_data = [data] (size_t idx, float value) {
data[idx] = value;
};
get_data = [data] (size_t idx) {
return data[idx];
};
return std::pair<std::shared_ptr<mem_lock<float>>, std::shared_ptr<mem_lock<uint16_t>>>(data_lock_ptr, nullptr);
return std::pair<std::shared_ptr<mem_lock<float, mem_lock_type::write>>, std::shared_ptr<mem_lock<uint16_t, mem_lock_type::write>>>(data_lock_ptr, nullptr);
}
case data_types::f16: {
std::shared_ptr<mem_lock<uint16_t>> data_lock_ptr = std::make_shared<mem_lock<uint16_t>>(memory, stream);
std::shared_ptr<mem_lock<uint16_t, mem_lock_type::write>> data_lock_ptr = std::make_shared<mem_lock<uint16_t, mem_lock_type::write>>(memory, stream);
uint16_t* data = data_lock_ptr->data();
set_data = [data] (size_t idx, float value) {
data[idx] = float_to_half(value);
};
get_data = [data] (size_t idx) {
return half_to_float(data[idx]);
};
return std::pair<std::shared_ptr<mem_lock<float>>, std::shared_ptr<mem_lock<uint16_t>>>(nullptr, data_lock_ptr);
return std::pair<std::shared_ptr<mem_lock<float, mem_lock_type::write>>, std::shared_ptr<mem_lock<uint16_t, mem_lock_type::write>>>(nullptr, data_lock_ptr);
}
default:
throw std::runtime_error("prepare_quantization: Unsupported precision of quantize output values");
Expand Down Expand Up @@ -378,8 +378,8 @@ void prepare_quantization::prepare_dequantize_merge(program& p, eltwise_node& el
auto mem0 = get_scale_shift_mem(eltwise_dep, i);
auto mem1 = get_scale_shift_mem(eltwise_node, i);

mem_lock<uint8_t> mem0_lock{mem0, stream};
mem_lock<uint8_t> mem1_lock{mem1, stream};
mem_lock<uint8_t, mem_lock_type::read> mem0_lock{mem0, stream};
mem_lock<uint8_t, mem_lock_type::read> mem1_lock{mem1, stream};
auto ptr0 = mem0_lock.data();
auto ptr1 = mem1_lock.data();

Expand Down Expand Up @@ -487,40 +487,40 @@ void prepare_quantization::prepare_asymmetric_quantization(program &p, convoluti
const auto& w_dt = wl.data_type;
const auto& azp_dt = azp->get_layout().data_type;

mem_lock<float> comp_lock{compensation, stream};
mem_lock<float, mem_lock_type::write> comp_lock{compensation, stream};

if (w_dt == data_types::u8 && azp_dt == data_types::u8) {
mem_lock<uint8_t> w_lock(w, stream);
mem_lock<uint8_t> azp_lock(azp, stream);
mem_lock<uint8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<uint8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<uint8_t> wzp_lock(wzp, stream);
mem_lock<uint8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<uint8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
}
} else if (w_dt == data_types::i8 && azp_dt == data_types::u8) {
mem_lock<int8_t> w_lock(w, stream);
mem_lock<uint8_t> azp_lock(azp, stream);
mem_lock<int8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<uint8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<int8_t> wzp_lock(wzp, stream);
mem_lock<int8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<int8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
}
} else if (w_dt == data_types::i8 && azp_dt == data_types::i8) {
mem_lock<int8_t> w_lock(w, stream);
mem_lock<int8_t> azp_lock(azp, stream);
mem_lock<int8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<int8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<int8_t> wzp_lock(wzp, stream);
mem_lock<int8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<int8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
}
} else if (w_dt == data_types::u8 && azp_dt == data_types::i8) {
mem_lock<uint8_t> w_lock(w, stream);
mem_lock<int8_t> azp_lock(azp, stream);
mem_lock<uint8_t, mem_lock_type::read> w_lock(w, stream);
mem_lock<int8_t, mem_lock_type::read> azp_lock(azp, stream);
if (wzp) {
mem_lock<uint8_t> wzp_lock(wzp, stream);
mem_lock<uint8_t, mem_lock_type::read> wzp_lock(wzp, stream);
fill_compensation_typed(w_lock.data(), azp_lock.data(), wzp_lock.data(), comp_lock.data(), GS, OC, IC, KS);
} else {
fill_compensation_typed(w_lock.data(), azp_lock.data(), static_cast<uint8_t*>(nullptr), comp_lock.data(), GS, OC, IC, KS);
Expand Down Expand Up @@ -579,8 +579,8 @@ void prepare_quantization::prepare_asymmetric_quantization(program &p, convoluti
int s = new_a_zp->get_output_layout().size.feature[0];
auto azp_aligned = p.get_engine().allocate_memory(l);
auto old_ptr = new_a_zp->as<data>().get_attached_memory_ptr();
mem_lock<int8_t> new_data{azp_aligned, stream};
mem_lock<int8_t> old_data{old_ptr, stream};
mem_lock<int8_t, mem_lock_type::write> new_data{azp_aligned, stream};
mem_lock<int8_t, mem_lock_type::read> old_data{old_ptr, stream};
for (int i = 0; i < ifm_aligned; i++) {
new_data.data()[i] = old_data.data()[i % s];
}
Expand All @@ -602,8 +602,8 @@ void prepare_quantization::prepare_asymmetric_quantization(program &p, convoluti
int s = new_w_zp->get_output_layout().size.batch[0];
auto wzp_aligned = p.get_engine().allocate_memory(l);
auto old_ptr = new_w_zp->as<data>().get_attached_memory_ptr();
mem_lock<int8_t> new_data{wzp_aligned, stream};
mem_lock<int8_t> old_data{old_ptr, stream};
mem_lock<int8_t, mem_lock_type::write> new_data{wzp_aligned, stream};
mem_lock<int8_t, mem_lock_type::read> old_data{old_ptr, stream};
for (int i = 0; i < ofm_aligned; i++) {
new_data.data()[i] = old_data.data()[i % s];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ struct condition_impl : typed_primitive_impl<condition> {
else
memory_to_copy = execute_branch(instance.get_net_false(), instance.result_id(), instance.input_memory_ptr());
// just copy memory
mem_lock<float> inp_ptr{memory_to_copy, instance.get_network().get_stream()};
mem_lock<float> out_ptr{instance.output_memory_ptr(), instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::read> inp_ptr{memory_to_copy, instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::write> out_ptr{instance.output_memory_ptr(), instance.get_network().get_stream()};
std::copy(inp_ptr.begin(), inp_ptr.end(), out_ptr.begin());
ev->set();
return ev;
Expand Down Expand Up @@ -71,11 +71,11 @@ struct condition_impl : typed_primitive_impl<condition> {
Returns boolean flag, which says what branch should be executed.
*/
bool choose_branch_to_exec(condition_inst& instance) const {
mem_lock<float> lock_compare_data{instance.compare_memory_ptr(), instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::read> lock_compare_data{instance.compare_memory_ptr(), instance.get_network().get_stream()};
auto compare_layout = instance.compare_memory().get_layout();
auto compare_ptr = lock_compare_data.begin();

mem_lock<float> lock_input{instance.input_memory_ptr(), instance.get_network().get_stream()};
mem_lock<float, mem_lock_type::read> lock_input{instance.input_memory_ptr(), instance.get_network().get_stream()};
auto input_layout = instance.input_memory().get_layout();
auto input_ptr = lock_input.begin();

Expand Down
Loading

0 comments on commit 0405656

Please sign in to comment.