From f0f82e4ad2be48d5d35a1a50cfb8b8dc0c562c42 Mon Sep 17 00:00:00 2001
From: geeky33
Date: Fri, 3 Jan 2025 01:56:36 +0530
Subject: [PATCH 01/11] Implemented CPU plugin just-in-time emitter for
 NotEqual operation

---
 .../plugin/aarch64/jit_eltwise_emitters.cpp   | 49 +++++++++++++++++++
 .../plugin/aarch64/jit_eltwise_emitters.hpp   | 29 +++++++++++
 .../nodes/executors/aarch64/jit_eltwise.cpp   |  2 +-
 .../aarch64/jit_uni_eltwise_generic.cpp       |  2 +
 4 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
index 534470c746f2fe..9b38543ae7e552 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
@@ -288,6 +288,55 @@ void jit_equal_emitter::register_table_entries() {
     push_arg_entry_of("one", 0x3f800000, true);
 }
 
+/// NOTEQUAL ///
+jit_not_equal_emitter::jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
+                                            dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                                            const std::shared_ptr<ov::Node>& node)
+    : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {
+    prepare_table();
+}
+
+jit_not_equal_emitter::jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
+                                            dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                                            const ov::element::Type exec_prc)
+    : jit_emitter(host, host_isa, exec_prc) {
+    prepare_table();
+}
+
+size_t jit_not_equal_emitter::get_inputs_count() const { return 2; }
+
+size_t jit_not_equal_emitter::get_aux_vecs_count() const { return 1; }
+
+size_t jit_not_equal_emitter::get_aux_gprs_count() const { return 1; }
+
+std::set<std::vector<element::Type>> jit_not_equal_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
+    return {{element::f32, element::f32}};
+}
+
+void jit_not_equal_emitter::emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const {
+    if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) {
+        emit_isa<dnnl::impl::cpu::aarch64::asimd>(in_vec_idxs, out_vec_idxs);
+    } else {
+        OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel");
+    }
+}
+template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
+void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+    OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string());
+    using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
+    const TReg src1 = TReg(in_vec_idxs[0]);
+    const TReg src2 = TReg(in_vec_idxs[1]);
+    const TReg dst = TReg(out_vec_idxs[0]);
+    const TReg aux = TReg(aux_vec_idxs[0]);
+    h->fcmeq(dst.s, src1.s, src2.s);
+    h->mvn(dst.b16, dst.b16);
+    h->ld1r(aux.s, table_val2("one"));
+    h->and_(dst.b16, dst.b16, aux.b16);
+}
+void jit_not_equal_emitter::register_table_entries() {
+    push_arg_entry_of("one", 0x3f800000, true);
+}
+
 /// ELU ///
 jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                  dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
index 13567b6fbf7d64..8540141beb7b14 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
@@ -137,6 +137,35 @@ class jit_equal_emitter : public jit_emitter {
     void register_table_entries() override;
 };
 
+class jit_not_equal_emitter : public jit_emitter {
+public:
+    jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
+                      dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                      const ov::element::Type exec_prc = ov::element::f32);
+
+
+    jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
+                      dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                      const std::shared_ptr<ov::Node>& n);
+
+    size_t get_inputs_count() const override;
+
+    size_t get_aux_vecs_count() const override;
+
+    size_t get_aux_gprs_count() const override;
+
+    static std::set<std::vector<element::Type>> get_supported_precisions(
+        const std::shared_ptr<ov::Node>& node = nullptr);
+
+private:
+    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const override;
+
+    template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
+    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+
+    void register_table_entries() override;
+};
+
 class jit_exp_emitter : public jit_emitter {
 public:
     jit_exp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
index 86d090a858fd7b..310a148eda964a 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
@@ -47,7 +47,7 @@ bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm,
                                        Algorithm::EltwiseMod,
                                        Algorithm::EltwiseMultiply,
                                        Algorithm::EltwiseMulAdd,
-                                       Algorithm::EltwisePowerStatic,
+                                       Algorithm::EltwiseNotEqual, Algorithm::EltwisePowerStatic,
                                        Algorithm::EltwisePrelu,
                                        Algorithm::EltwiseRelu,
                                        Algorithm::EltwiseRoundHalfAwayFromZero,
diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp
index 4b4b07df572b4a..ee1436862035e7 100644
--- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp
+++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp
@@ -681,6 +681,7 @@ std::shared_ptr<jit_emitter> jit_uni_eltwise_generic<isa>::create_eltwise_emitte
         OV_CASE(Algorithm::EltwiseGreater, ov::intel_cpu::aarch64::jit_greater_emitter),
         OV_CASE(Algorithm::EltwiseGreaterEqual, ov::intel_cpu::aarch64::jit_greater_equal_emitter),
         OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter),
+        OV_CASE(Algorithm::EltwiseNotEqual, ov::intel_cpu::aarch64::jit_not_equal_emitter),
         OV_CASE(Algorithm::EltwiseMod, ov::intel_cpu::aarch64::jit_mod_emitter),
         OV_CASE(Algorithm::EltwiseMultiply, ov::intel_cpu::aarch64::jit_multiply_emitter),
         OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_static_emitter),
@@ -869,6 +870,7 @@ std::set<std::vector<element::Type>> eltwise_precision_helper::get_supported_pre
         OV_CASE(Algorithm::EltwiseMish, jit_mish_emitter),
         OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter),
         OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter),
+        OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter),
         OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter),
         OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter),
         OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter),

From 6797eecd191af42d9067874b2fa86ccfe8bfd88d Mon Sep 17 00:00:00 2001
From: "ayraa.ai" <141430616+geeky33@users.noreply.github.com>
Date: Tue, 14 Jan 2025 21:16:36 +0530
Subject: [PATCH 02/11] Updated jit_eltwise_emitters.cpp

---
 .../src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
index 152655ff56a38c..0441235dab26a9 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
@@ -330,7 +330,7 @@ void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, con
     const TReg dst = TReg(out_vec_idxs[0]);
     const TReg aux = TReg(aux_vec_idxs[0]);
     h->fcmeq(dst.s, src1.s, src2.s);
-    h->mvn(dst.b16, dst.b16);
+    h->not_(dst.b16, dst.b16);
     h->ld1r(aux.s, table_val2("one"));
     h->and_(dst.b16, dst.b16, aux.b16);
 }
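After patch 02, the emitter's four-instruction ASIMD sequence computes dst = (src1 != src2) ? 1.0f : 0.0f in every 32-bit lane: fcmeq writes an all-ones lane mask where the operands compare equal, not_ (mvn in patch 01) inverts that mask, and and_ intersects it with a broadcast 1.0f loaded from the emitter's constant table via ld1r (0x3f800000 is the IEEE-754 bit pattern of 1.0f). The following scalar model of a single lane is a sketch for illustration only; the function name and structure are ours, not part of the emitter:

    #include <cstdint>
    #include <cstring>

    // Scalar model of the fcmeq / not_ / and_ sequence for one f32 lane.
    float not_equal_lane(float a, float b) {
        uint32_t mask = (a == b) ? 0xFFFFFFFFu : 0u;  // fcmeq: all-ones where equal
        mask = ~mask;                                 // not_: all-ones where a != b
        const uint32_t one = 0x3f800000u;             // table entry "one" = bits of 1.0f
        const uint32_t bits = mask & one;             // and_: keep 1.0f only where a != b
        float out;
        std::memcpy(&out, &bits, sizeof(out));        // bit-cast the masked bits to float
        return out;                                   // 1.0f if a != b, otherwise 0.0f
    }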
From 6aaea6d3bb96a3207b86b33c525d42d8bc4d35e1 Mon Sep 17 00:00:00 2001
From: "ayraa.ai" <141430616+geeky33@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:02:38 +0530
Subject: [PATCH 03/11] Updated jit_eltwise_emitters.cpp

---
 .../plugin/aarch64/jit_eltwise_emitters.cpp | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
index b0a18357c47073..f21aba522c76b1 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
@@ -304,17 +304,25 @@ jit_not_equal_emitter::jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_gener
     prepare_table();
 }
 
-size_t jit_not_equal_emitter::get_inputs_count() const { return 2; }
+size_t jit_not_equal_emitter::get_inputs_count() const {
+    return 2;
+}
 
-size_t jit_not_equal_emitter::get_aux_vecs_count() const { return 1; }
+size_t jit_not_equal_emitter::get_aux_vecs_count() const {
+    return 1;
+}
 
-size_t jit_not_equal_emitter::get_aux_gprs_count() const { return 1; }
+size_t jit_not_equal_emitter::get_aux_gprs_count() const {
+    return 1;
+}
 
-std::set<std::vector<element::Type>> jit_not_equal_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
+std::set<std::vector<element::Type>> jit_not_equal_emitter::get_supported_precisions(
+    const std::shared_ptr<ov::Node>& node) {
     return {{element::f32, element::f32}};
 }
 
-void jit_not_equal_emitter::emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const {
+void jit_not_equal_emitter::emit_impl(const std::vector<size_t>& in_vec_idxs,
+                                      const std::vector<size_t>& out_vec_idxs) const {
     if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) {
         emit_isa<dnnl::impl::cpu::aarch64::asimd>(in_vec_idxs, out_vec_idxs);
     } else {
@@ -322,14 +330,18 @@ void jit_not_equal_emitter::emit_impl(const std::vector<size_t>& in_vec_idxs, co
     }
 }
 template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
-void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
+void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
+                                     const std::vector<size_t> &out_vec_idxs) const {
     OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string());
+
     using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
     const TReg src1 = TReg(in_vec_idxs[0]);
     const TReg src2 = TReg(in_vec_idxs[1]);
     const TReg dst = TReg(out_vec_idxs[0]);
     const TReg aux = TReg(aux_vec_idxs[0]);
+
     h->fcmeq(dst.s, src1.s, src2.s);
+
     h->not_(dst.b16, dst.b16);
     h->ld1r(aux.s, table_val2("one"));
     h->and_(dst.b16, dst.b16, aux.b16);

From 0507f00db284f3ea8209f6ff89884108dfe9f224 Mon Sep 17 00:00:00 2001
From: "ayraa.ai" <141430616+geeky33@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:08:30 +0530
Subject: [PATCH 04/11] Updated jit_eltwise_emitters.hpp

---
 .../plugin/aarch64/jit_eltwise_emitters.hpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
index 7058b51539ce35..bdce88bddaa00c 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
@@ -140,13 +140,13 @@ class jit_equal_emitter : public jit_emitter {
 class jit_not_equal_emitter : public jit_emitter {
 public:
     jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                      dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                      const ov::element::Type exec_prc = ov::element::f32);
+                          dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                          const ov::element::Type exec_prc = ov::element::f32);
 
 
     jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                      dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                      const std::shared_ptr<ov::Node>& n);
+                          dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                          const std::shared_ptr<ov::Node>& n);
 
     size_t get_inputs_count() const override;
 
@@ -155,13 +155,13 @@ class jit_not_equal_emitter : public jit_emitter {
     size_t get_aux_gprs_count() const override;
 
     static std::set<std::vector<element::Type>> get_supported_precisions(
-        const std::shared_ptr<ov::Node>& node = nullptr);
+        const std::shared_ptr<ov::Node>& node = nullptr);
 
 private:
-    void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const override;
+    void emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const override;
 
     template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
+    void emit_isa(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const;
 
     void register_table_entries() override;
 };

From 78e5b4d1be8aebbe15769ea37805dd3ec96c1734 Mon Sep 17 00:00:00 2001
From: "ayraa.ai" <141430616+geeky33@users.noreply.github.com>
Date: Tue, 21 Jan 2025 16:13:21 +0530
Subject: [PATCH 05/11] Updated jit_eltwise.cpp

---
 .../intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
index 795e6d4d6ee0d9..4ed2b498165ee5 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
@@ -50,7 +50,8 @@ bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm,
                                        Algorithm::EltwiseMod,
                                        Algorithm::EltwiseMultiply,
                                        Algorithm::EltwiseMulAdd,
-                                       Algorithm::EltwiseNotEqual, Algorithm::EltwisePowerStatic,
+                                       Algorithm::EltwiseNotEqual,
+                                       Algorithm::EltwisePowerStatic,
                                        Algorithm::EltwisePrelu,
                                        Algorithm::EltwiseRelu,
                                        Algorithm::EltwiseRoundHalfAwayFromZero,
From a55d4e83483758140c2fa48cc59643046e29ad38 Mon Sep 17 00:00:00 2001
From: "ayraa.ai" <141430616+geeky33@users.noreply.github.com>
Date: Tue, 21 Jan 2025 21:41:52 +0530
Subject: [PATCH 06/11] Updated jit_eltwise_emitters.cpp

---
 .../plugin/aarch64/jit_eltwise_emitters.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
index f21aba522c76b1..ae5145bcdc351c 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
@@ -291,20 +291,20 @@ void jit_equal_emitter::register_table_entries() {
 
 /// NOTEQUAL ///
 jit_not_equal_emitter::jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                                            dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                                            const std::shared_ptr<ov::Node>& node)
-    : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {
+                                             dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                                             const std::shared_ptr<ov::Node>& node)
+    : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {
     prepare_table();
 }
 
 jit_not_equal_emitter::jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                                            dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                                            const ov::element::Type exec_prc)
-    : jit_emitter(host, host_isa, exec_prc) {
+                                             dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
+                                             const ov::element::Type exec_prc)
+    : jit_emitter(host, host_isa, exec_prc) {
     prepare_table();
 }
 
-size_t jit_not_equal_emitter::get_inputs_count() const { 
+size_t jit_not_equal_emitter::get_inputs_count() const {
     return 2;
 }

From 5ee07f0dc30277150874136216191957d04d8811 Mon Sep 17 00:00:00 2001
From: geeky33
Date: Wed, 22 Jan 2025 01:08:15 +0530
Subject: [PATCH 07/11] switching to new branch for new PR

---
 .../plugin/aarch64/jit_eltwise_emitters.cpp | 60 -------------------
 .../plugin/aarch64/jit_eltwise_emitters.hpp | 29 ---------
 .../nodes/executors/aarch64/jit_eltwise.cpp |  1 -
 .../aarch64/jit_uni_eltwise_generic.cpp     |  2 --
 4 files changed, 92 deletions(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
index ae5145bcdc351c..d2eeca3452690a 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
@@ -289,66 +289,6 @@ void jit_equal_emitter::register_table_entries() {
     push_arg_entry_of("one", 0x3f800000, true);
 }
 
-/// NOTEQUAL ///
-jit_not_equal_emitter::jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                                             dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                                             const std::shared_ptr<ov::Node>& node)
-    : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {
-    prepare_table();
-}
-
-jit_not_equal_emitter::jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                                             dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                                             const ov::element::Type exec_prc)
-    : jit_emitter(host, host_isa, exec_prc) {
-    prepare_table();
-}
-
-size_t jit_not_equal_emitter::get_inputs_count() const {
-    return 2;
-}
-
-size_t jit_not_equal_emitter::get_aux_vecs_count() const {
-    return 1;
-}
-
-size_t jit_not_equal_emitter::get_aux_gprs_count() const {
-    return 1;
-}
-
-std::set<std::vector<element::Type>> jit_not_equal_emitter::get_supported_precisions(
-    const std::shared_ptr<ov::Node>& node) {
-    return {{element::f32, element::f32}};
-}
-
-void jit_not_equal_emitter::emit_impl(const std::vector<size_t>& in_vec_idxs,
-                                      const std::vector<size_t>& out_vec_idxs) const {
-    if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) {
-        emit_isa<dnnl::impl::cpu::aarch64::asimd>(in_vec_idxs, out_vec_idxs);
-    } else {
-        OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel");
-    }
-}
-template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
-void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
-                                     const std::vector<size_t> &out_vec_idxs) const {
-    OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string());
-
-    using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
-    const TReg src1 = TReg(in_vec_idxs[0]);
-    const TReg src2 = TReg(in_vec_idxs[1]);
-    const TReg dst = TReg(out_vec_idxs[0]);
-    const TReg aux = TReg(aux_vec_idxs[0]);
-
-    h->fcmeq(dst.s, src1.s, src2.s);
-
-    h->not_(dst.b16, dst.b16);
-    h->ld1r(aux.s, table_val2("one"));
-    h->and_(dst.b16, dst.b16, aux.b16);
-}
-void jit_not_equal_emitter::register_table_entries() {
-    push_arg_entry_of("one", 0x3f800000, true);
-}
 
 
 /// ELU ///
diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
index bdce88bddaa00c..5d0e00e2da42b0 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp
@@ -137,35 +137,6 @@ class jit_equal_emitter : public jit_emitter {
     void register_table_entries() override;
 };
 
-class jit_not_equal_emitter : public jit_emitter {
-public:
-    jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                          dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                          const ov::element::Type exec_prc = ov::element::f32);
-
-
-    jit_not_equal_emitter(dnnl::impl::cpu::aarch64::jit_generator *host,
-                          dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
-                          const std::shared_ptr<ov::Node>& n);
-
-    size_t get_inputs_count() const override;
-
-    size_t get_aux_vecs_count() const override;
-
-    size_t get_aux_gprs_count() const override;
-
-    static std::set<std::vector<element::Type>> get_supported_precisions(
-        const std::shared_ptr<ov::Node>& node = nullptr);
-
-private:
-    void emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const override;
-
-    template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
-    void emit_isa(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs) const;
-
-    void register_table_entries() override;
-};
-
 class jit_exp_emitter : public jit_emitter {
 public:
     jit_exp_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
index 4ed2b498165ee5..8d5e905f10e86a 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp
@@ -50,7 +50,6 @@ bool JitEltwiseExecutor::isSupported(const Algorithm& algorithm,
                                        Algorithm::EltwiseMod,
                                        Algorithm::EltwiseMultiply,
                                        Algorithm::EltwiseMulAdd,
-                                       Algorithm::EltwiseNotEqual,
                                        Algorithm::EltwisePowerStatic,
                                        Algorithm::EltwisePrelu,
                                        Algorithm::EltwiseRelu,
diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp
index cfb6d1b504ba01..5e69cfb36b5462 100644
--- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp
+++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp
@@ -685,7 +685,6 @@ std::shared_ptr<jit_emitter> jit_uni_eltwise_generic<isa>::create_eltwise_emitte
         OV_CASE(Algorithm::EltwiseGreater, ov::intel_cpu::aarch64::jit_greater_emitter),
         OV_CASE(Algorithm::EltwiseGreaterEqual, ov::intel_cpu::aarch64::jit_greater_equal_emitter),
         OV_CASE(Algorithm::EltwiseMulAdd, ov::intel_cpu::aarch64::jit_mul_add_emitter),
-        OV_CASE(Algorithm::EltwiseNotEqual, ov::intel_cpu::aarch64::jit_not_equal_emitter),
         OV_CASE(Algorithm::EltwiseMod, ov::intel_cpu::aarch64::jit_mod_emitter),
         OV_CASE(Algorithm::EltwiseMultiply, ov::intel_cpu::aarch64::jit_multiply_emitter),
         OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_static_emitter),
@@ -876,7 +875,6 @@ std::set<std::vector<element::Type>> eltwise_precision_helper::get_supported_pre
         OV_CASE(Algorithm::EltwiseMish, jit_mish_emitter),
         OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter),
         OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter),
-        OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter),
         OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter),
        OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter),
        OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter),
From 7c1f575c3ab8b9c33f55071643fba1a2c8f8242e Mon Sep 17 00:00:00 2001
From: geeky33
Date: Wed, 22 Jan 2025 02:22:44 +0530
Subject: [PATCH 08/11] Added support for aten::quantile and its tests

---
 src/frontends/pytorch/src/op/quantile.cpp | 96 +++++++++++++++++++
 src/frontends/pytorch/src/op_table.cpp    |  2 +
 .../pytorch_tests/test_quantile.py        | 36 +++++++
 3 files changed, 134 insertions(+)
 create mode 100644 src/frontends/pytorch/src/op/quantile.cpp
 create mode 100644 tests/layer_tests/pytorch_tests/test_quantile.py

diff --git a/src/frontends/pytorch/src/op/quantile.cpp b/src/frontends/pytorch/src/op/quantile.cpp
new file mode 100644
index 00000000000000..4c9bf95a5e9648
--- /dev/null
+++ b/src/frontends/pytorch/src/op/quantile.cpp
@@ -0,0 +1,96 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/frontend/pytorch/node_context.hpp"
+#include "openvino/op/convert.hpp"
+#include "openvino/op/gather.hpp"
+#include "openvino/op/range.hpp"
+#include "openvino/op/reshape.hpp"
+#include "openvino/opsets/opset10.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/op/floor.hpp"
+#include "openvino/op/add.hpp"
+#include "openvino/op/subtract.hpp"
+#include "openvino/op/maximum.hpp"
+#include "openvino/op/minimum.hpp"
+#include "utils.hpp"
+
+namespace ov {
+namespace frontend {
+namespace pytorch {
+namespace op {
+
+using namespace ov::op;
+
+OutputVector translate_quantile(const NodeContext& context) {
+    // aten::quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor
+    num_inputs_check(context, 2, 4);
+
+    auto input = context.get_input(0);
+    auto quantiles = context.get_input(1);
+
+    // Handle optional inputs
+    auto dim = context.input_is_none(2) ? -1 : context.get_input(2);
+    auto keepdim = context.input_is_none(3) ? false : context.get_input(3);
+
+    // If dim is -1 (not specified), flatten the tensor
+    if (dim == -1) {
+        input = context.mark_node(std::make_shared<v1::Reshape>(
+            input, context.mark_node(v0::Constant::create(element::i64, {1}, {-1})), true));
+        dim = 0;  // Set the dimension to 0 for the flattened tensor
+    }
+
+    // Sort the tensor along the specified dimension
+    auto sort_result = context.mark_node(std::make_shared(input, dim, true));
+    auto sorted_tensor = sort_result->output(0);
+
+    // Get the size of the specified dimension
+    auto input_shape = context.mark_node(std::make_shared<v3::ShapeOf>(input));
+    auto dim_size = context.mark_node(std::make_shared<v8::Gather>(
+        input_shape, context.mark_node(v0::Constant::create(element::i64, {}, {dim})),
+        v0::Constant::create(element::i64, {}, {0})));
+
+    // Compute quantile indices: q * (dim_size - 1)
+    auto scaled_q = context.mark_node(std::make_shared<v1::Multiply>(
+        quantiles, context.mark_node(std::make_shared<v1::Subtract>(
+        dim_size, v0::Constant::create(element::i64, {}, {1})))));
+    auto lower_indices = context.mark_node(std::make_shared<v0::Floor>(scaled_q));
+    auto upper_indices = context.mark_node(std::make_shared<v1::Add>(
+        lower_indices, v0::Constant::create(element::i64, {}, {1})));
+
+    // Clamp indices to valid range
+    lower_indices = context.mark_node(std::make_shared<v1::Maximum>(
+        lower_indices, v0::Constant::create(element::i64, {}, {0})));
+    upper_indices = context.mark_node(std::make_shared<v1::Minimum>(
+        upper_indices, context.mark_node(std::make_shared<v1::Subtract>(
+        dim_size, v0::Constant::create(element::i64, {}, {1})))));
+
+    // Gather values at the indices
+    auto lower_values = context.mark_node(std::make_shared<v8::Gather>(sorted_tensor, lower_indices, dim));
+    auto upper_values = context.mark_node(std::make_shared<v8::Gather>(sorted_tensor, upper_indices, dim));
+
+    // Compute interpolation weights
+    auto weights = context.mark_node(std::make_shared<v1::Subtract>(scaled_q, lower_indices));
+
+    // Interpolate between lower and upper values
+    auto result = context.mark_node(std::make_shared<v1::Add>(
+        lower_values, context.mark_node(std::make_shared<v1::Multiply>(weights, context.mark_node(std::make_shared<v1::Subtract>(upper_values, lower_values))))));
+
+    // Reshape if keepdim is false
+    if (!keepdim) {
+        auto input_shape = context.mark_node(std::make_shared<v3::ShapeOf>(input));
+        auto output_shape = context.mark_node(std::make_shared<v8::Gather>(
+            input_shape,
+            context.mark_node(v0::Constant::create(element::i64, {1}, {dim})),
+            v0::Constant::create(element::i64, {}, {0})));
+        result = context.mark_node(std::make_shared<v1::Reshape>(result, output_shape, true));
+    }
+
+    return {result};
+}
+
+}  // namespace op
+}  // namespace pytorch
+}  // namespace frontend
+}  // namespace ov
diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp
index 00e3a55b0bc327..d6ffa242fab2f2 100644
--- a/src/frontends/pytorch/src/op_table.cpp
+++ b/src/frontends/pytorch/src/op_table.cpp
@@ -190,6 +190,7 @@ OP_CONVERTER(translate_quantized_add);
 OP_CONVERTER(translate_quantized_add_relu);
 OP_CONVERTER(translate_quantized_hardswish);
 OP_CONVERTER(translate_quantized_mul);
+OP_CONVERTER(translate_quantile);
 OP_CONVERTER(translate_range_length);
 OP_CONVERTER(translate_rand);
 OP_CONVERTER(translate_randn);
@@ -745,6 +746,7 @@ const std::unordered_map<std::string, CreatorFunction> get_supported_ops_ts() {
     {"quantized::hardswish", op::translate_quantized_hardswish},
     {"quantized::linear", op::translate_quantized_linear},
     {"quantized::mul", op::translate_quantized_mul},
+    {"aten::quantile", op::translate_quantile},
     {"torchvision::deform_conv2d", op::translate_deform_conv},
     {"torchvision::nms", op::translate_nms},
     {"torchvision::roi_align", op::translate_roi_align},
diff --git a/tests/layer_tests/pytorch_tests/test_quantile.py b/tests/layer_tests/pytorch_tests/test_quantile.py
new file mode 100644
index 00000000000000..4dff96d4c96de2
--- /dev/null
+++ b/tests/layer_tests/pytorch_tests/test_quantile.py
@@ -0,0 +1,36 @@
+# Copyright (C) 2018-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import numpy as np
+import torch
+from pytorch_layer_test_class import PytorchLayerTest
+
+
+class TestQuantile(PytorchLayerTest):
+    def _prepare_input(self):
+        input_tensor = np.random.randn(1, 3, 224, 224).astype(np.float32)
+        quantile = np.array(0.5, dtype=np.float32)
+        return (input_tensor, quantile)
+
+    def create_model(self, dim=None, keepdim=False):
+        class aten_quantile(torch.nn.Module):
+            def __init__(self, dim, keepdim):
+                super(aten_quantile, self).__init__()
+                self.dim = dim
+                self.keepdim = keepdim
+
+            def forward(self, x, q):
+                return torch.quantile(x, q, dim=self.dim, keepdim=self.keepdim)
+
+        ref_net = None
+
+        return aten_quantile(dim, keepdim), ref_net, "aten::quantile"
+
+    @pytest.mark.parametrize("dim", [None, 0, 1, 2, 3, -1, -2, -3])
+    @pytest.mark.parametrize("keepdim", [True, False])
+    @pytest.mark.nightly
+    @pytest.mark.precommit
+    def test_quantile(self, dim, keepdim, ie_device, precision, ir_version):
+        self._test(*self.create_model(dim, keepdim), ie_device, precision, ir_version)

From 2ff981951a0014ed933da622c0c252c457dca8dc Mon Sep 17 00:00:00 2001
From: geeky33
Date: Wed, 22 Jan 2025 02:26:42 +0530
Subject: [PATCH 09/11] Added support for aten::quantile and its tests

---
 src/frontends/pytorch/src/op/quantile.cpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/frontends/pytorch/src/op/quantile.cpp b/src/frontends/pytorch/src/op/quantile.cpp
index 4c9bf95a5e9648..1340ed90871090 100644
--- a/src/frontends/pytorch/src/op/quantile.cpp
+++ b/src/frontends/pytorch/src/op/quantile.cpp
@@ -24,34 +24,28 @@ namespace op {
 using namespace ov::op;
 
 OutputVector translate_quantile(const NodeContext& context) {
-    // aten::quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor
     num_inputs_check(context, 2, 4);
 
     auto input = context.get_input(0);
     auto quantiles = context.get_input(1);
 
-    // Handle optional inputs
     auto dim = context.input_is_none(2) ? -1 : context.get_input(2);
     auto keepdim = context.input_is_none(3) ? false : context.get_input(3);
 
-    // If dim is -1 (not specified), flatten the tensor
     if (dim == -1) {
         input = context.mark_node(std::make_shared<v1::Reshape>(
             input, context.mark_node(v0::Constant::create(element::i64, {1}, {-1})), true));
-        dim = 0;  // Set the dimension to 0 for the flattened tensor
+        dim = 0;
     }
 
-    // Sort the tensor along the specified dimension
     auto sort_result = context.mark_node(std::make_shared(input, dim, true));
     auto sorted_tensor = sort_result->output(0);
 
-    // Get the size of the specified dimension
     auto input_shape = context.mark_node(std::make_shared<v3::ShapeOf>(input));
     auto dim_size = context.mark_node(std::make_shared<v8::Gather>(
         input_shape, context.mark_node(v0::Constant::create(element::i64, {}, {dim})),
         v0::Constant::create(element::i64, {}, {0})));
 
-    // Compute quantile indices: q * (dim_size - 1)
     auto scaled_q = context.mark_node(std::make_shared<v1::Multiply>(
         quantiles, context.mark_node(std::make_shared<v1::Subtract>(
         dim_size, v0::Constant::create(element::i64, {}, {1})))));
     auto lower_indices = context.mark_node(std::make_shared<v0::Floor>(scaled_q));
     auto upper_indices = context.mark_node(std::make_shared<v1::Add>(
         lower_indices, v0::Constant::create(element::i64, {}, {1})));
 
-    // Clamp indices to valid range
     lower_indices = context.mark_node(std::make_shared<v1::Maximum>(
         lower_indices, v0::Constant::create(element::i64, {}, {0})));
     upper_indices = context.mark_node(std::make_shared<v1::Minimum>(
         upper_indices, context.mark_node(std::make_shared<v1::Subtract>(
         dim_size, v0::Constant::create(element::i64, {}, {1})))));
 
-    // Gather values at the indices
     auto lower_values = context.mark_node(std::make_shared<v8::Gather>(sorted_tensor, lower_indices, dim));
     auto upper_values = context.mark_node(std::make_shared<v8::Gather>(sorted_tensor, upper_indices, dim));
 
-    // Compute interpolation weights
     auto weights = context.mark_node(std::make_shared<v1::Subtract>(scaled_q, lower_indices));
 
-    // Interpolate between lower and upper values
     auto result = context.mark_node(std::make_shared<v1::Add>(
         lower_values, context.mark_node(std::make_shared<v1::Multiply>(weights, context.mark_node(std::make_shared<v1::Subtract>(upper_values, lower_values))))));
 
-    // Reshape if keepdim is false
     if (!keepdim) {
         auto input_shape = context.mark_node(std::make_shared<v3::ShapeOf>(input));
         auto output_shape = context.mark_node(std::make_shared<v8::Gather>(
             input_shape,
             context.mark_node(v0::Constant::create(element::i64, {1}, {dim})),
             v0::Constant::create(element::i64, {}, {0})));
         result = context.mark_node(std::make_shared<v1::Reshape>(result, output_shape, true));
     }
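The converter in patches 08 and 09 follows the standard linear-interpolation quantile: over a sorted axis of length n, the q-th quantile sits at fractional index q * (n - 1), and the result interpolates between the two neighboring elements. A self-contained scalar sketch of the same steps (the helper name is ours and this is illustrative only, not the converter itself):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Mirrors the converter: sort, scale q by (n - 1), clamp the
    // neighbor indices, then linearly interpolate between them.
    float quantile_linear(std::vector<float> values, float q) {
        std::sort(values.begin(), values.end());
        const std::size_t n = values.size();
        const float pos = q * static_cast<float>(n - 1);                      // scaled_q
        const std::size_t lower = static_cast<std::size_t>(std::floor(pos));  // Floor
        const std::size_t upper = std::min(lower + 1, n - 1);                 // clamped upper index
        const float w = pos - static_cast<float>(lower);                      // interpolation weight
        return values[lower] + w * (values[upper] - values[lower]);
    }

For example, quantile_linear({1.0f, 2.0f, 3.0f, 4.0f}, 0.5f) returns 2.5f, which matches torch.quantile on the same data with the default linear interpolation.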
From ae08bd360508b517e922b35e11a830104add1ec5 Mon Sep 17 00:00:00 2001
From: geeky33
Date: Wed, 22 Jan 2025 02:41:36 +0530
Subject: [PATCH 10/11] Reverted some relevant changes

---
 .../src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
index d2eeca3452690a..b1e64cd25ba0b4 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp
@@ -289,7 +289,6 @@ void jit_equal_emitter::register_table_entries() {
     push_arg_entry_of("one", 0x3f800000, true);
 }
 
-
 /// ELU ///
 jit_elu_emitter::jit_elu_emitter(dnnl::impl::cpu::aarch64::jit_generator* host,
                                  dnnl::impl::cpu::aarch64::cpu_isa_t host_isa,
From 156d2f08da073caabf93612129ed37491ade545d Mon Sep 17 00:00:00 2001
From: "ayraa.ai" <141430616+geeky33@users.noreply.github.com>
Date: Thu, 23 Jan 2025 02:33:45 +0530
Subject: [PATCH 11/11] Updated quantile.cpp

---
 src/frontends/pytorch/src/op/quantile.cpp | 81 +++++++++++------------
 1 file changed, 38 insertions(+), 43 deletions(-)

diff --git a/src/frontends/pytorch/src/op/quantile.cpp b/src/frontends/pytorch/src/op/quantile.cpp
index 1340ed90871090..2eefb34bbca029 100644
--- a/src/frontends/pytorch/src/op/quantile.cpp
+++ b/src/frontends/pytorch/src/op/quantile.cpp
@@ -16,64 +16,59 @@
 #include "openvino/op/minimum.hpp"
 #include "utils.hpp"
 
-namespace ov {
-namespace frontend {
-namespace pytorch {
-namespace op {
-
 using namespace ov::op;
 
 OutputVector translate_quantile(const NodeContext& context) {
-    num_inputs_check(context, 2, 4);
+    num_inputs_check(context, 2, 5);
 
     auto input = context.get_input(0);
-    auto quantiles = context.get_input(1);
+    auto q = context.get_input(1);  // Quantile(s), can be float or tensor
 
     auto dim = context.input_is_none(2) ? -1 : context.get_input(2);
     auto keepdim = context.input_is_none(3) ? false : context.get_input(3);
+    auto interpolation = context.input_is_none(4) ? "linear" : context.get_input(4);
+
     if (dim == -1) {
         input = context.mark_node(std::make_shared<v1::Reshape>(
-            input, context.mark_node(v0::Constant::create(element::i64, {1}, {-1})), true));
+            input, context.mark_node(std::make_shared(0, input.get_shape().size(), 1)), true));
         dim = 0;
     }
 
-    auto sort_result = context.mark_node(std::make_shared(input, dim, true));
-    auto sorted_tensor = sort_result->output(0);
-
-    auto input_shape = context.mark_node(std::make_shared<v3::ShapeOf>(input));
-    auto dim_size = context.mark_node(std::make_shared<v8::Gather>(
-        input_shape, context.mark_node(v0::Constant::create(element::i64, {}, {dim})),
-        v0::Constant::create(element::i64, {}, {0})));
-
-    auto scaled_q = context.mark_node(std::make_shared<v1::Multiply>(
-        quantiles, context.mark_node(std::make_shared<v1::Subtract>(
-        dim_size, v0::Constant::create(element::i64, {}, {1})))));
-    auto lower_indices = context.mark_node(std::make_shared<v0::Floor>(scaled_q));
-    auto upper_indices = context.mark_node(std::make_shared<v1::Add>(
-        lower_indices, v0::Constant::create(element::i64, {}, {1})));
-
-    lower_indices = context.mark_node(std::make_shared<v1::Maximum>(
-        lower_indices, v0::Constant::create(element::i64, {}, {0})));
-    upper_indices = context.mark_node(std::make_shared<v1::Minimum>(
-        upper_indices, context.mark_node(std::make_shared<v1::Subtract>(
-        dim_size, v0::Constant::create(element::i64, {}, {1})))));
-
-    auto lower_values = context.mark_node(std::make_shared<v8::Gather>(sorted_tensor, lower_indices, dim));
-    auto upper_values = context.mark_node(std::make_shared<v8::Gather>(sorted_tensor, upper_indices, dim));
-
-    auto weights = context.mark_node(std::make_shared<v1::Subtract>(scaled_q, lower_indices));
-
-    auto result = context.mark_node(std::make_shared<v1::Add>(
-        lower_values, context.mark_node(std::make_shared<v1::Multiply>(weights, context.mark_node(std::make_shared<v1::Subtract>(upper_values, lower_values))))));
-
+    auto sorted = context.mark_node(std::make_shared(input, dim, true));  // Ascending order
+
+    auto dim_size = input.get_shape()[dim];
+
+    auto indices = context.mark_node(std::make_shared<v1::Multiply>(q, dim_size - 1));
+    auto lower_indices = context.mark_node(std::make_shared<v0::Floor>(indices));
+    auto upper_indices = context.mark_node(std::make_shared<v1::Add>(lower_indices, 1));
+    auto weights = context.mark_node(std::make_shared<v1::Subtract>(indices, lower_indices));
+    auto lower_values = context.mark_node(std::make_shared<v8::Gather>(sorted, lower_indices, dim));
+    auto upper_values = context.mark_node(std::make_shared<v8::Gather>(sorted, upper_indices, dim));
+
+    Output<Node> result;
+    if (interpolation == "linear") {
+        result = context.mark_node(std::make_shared<v1::Add>(
+            lower_values, context.mark_node(std::make_shared<v1::Multiply>(weights, upper_values))));
+    } else if (interpolation == "lower") {
+        result = lower_values;
+    } else if (interpolation == "higher") {
+        result = upper_values;
+    } else if (interpolation == "nearest") {
+        auto nearest_indices = context.mark_node(std::make_shared(indices));
+        result = context.mark_node(std::make_shared<v8::Gather>(sorted, nearest_indices, dim));
+    } else if (interpolation == "midpoint") {
+        result = context.mark_node(std::make_shared<v1::Add>(
+            lower_values, context.mark_node(std::make_shared<v1::Multiply>(
+                context.mark_node(std::make_shared<v0::Constant>(element::f32, Shape{}, 0.5)),
+                context.mark_node(std::make_shared<v1::Subtract>(upper_values, lower_values))))));
+    } else {
+        throw std::runtime_error("Unsupported interpolation method: " + interpolation);
+    }
     if (!keepdim) {
-        auto input_shape = context.mark_node(std::make_shared<v3::ShapeOf>(input));
-        auto output_shape = context.mark_node(std::make_shared<v8::Gather>(
-            input_shape,
-            context.mark_node(v0::Constant::create(element::i64, {1}, {dim})),
-            v0::Constant::create(element::i64, {}, {0})));
-        result = context.mark_node(std::make_shared<v1::Reshape>(result, output_shape, true));
+        auto reshape_dims = input.get_shape();
+        reshape_dims.erase(reshape_dims.begin() + dim);
+        result = context.mark_node(std::make_shared<v1::Reshape>(result, reshape_dims, true));
     }
 
     return {result};
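Patch 11's interpolation branches mirror the five modes accepted by torch.quantile. Continuing the scalar sketch from above (again with our own naming, purely illustrative: lower and upper are the two sorted neighbor values and w is the fractional part of the scaled index):

    #include <stdexcept>
    #include <string>

    // One lane of each torch.quantile interpolation mode.
    float interpolate(float lower, float upper, float w, const std::string& mode) {
        if (mode == "linear")   return lower + w * (upper - lower);
        if (mode == "lower")    return lower;
        if (mode == "higher")   return upper;
        if (mode == "nearest")  return (w < 0.5f) ? lower : upper;  // round to the nearer neighbor
        if (mode == "midpoint") return 0.5f * (lower + upper);
        throw std::runtime_error("Unsupported interpolation method: " + mode);
    }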