From 23bd89ed87a7e435150e37a9cbcb87db6a05debe Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Tue, 30 Jul 2024 14:16:49 +0800
Subject: [PATCH] feat: vqdmlsl_high_lane[q]_[s16|s32]

---
 neon2rvv.h     |  32 +++++++++++++--
 tests/impl.cpp | 108 +++++++++++++++++++++++++++++++++++++++++++++++--
 tests/impl.h   |   8 ++--
 3 files changed, 136 insertions(+), 12 deletions(-)

diff --git a/neon2rvv.h b/neon2rvv.h
index 6f123193..1de140a8 100644
--- a/neon2rvv.h
+++ b/neon2rvv.h
@@ -9446,9 +9446,21 @@ FORCE_INLINE int64_t vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c, const
   return sat_sub_int64(a, dmull);
 }
 
-// FORCE_INLINE int32x4_t vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t c, const int lane);
+FORCE_INLINE int32x4_t vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t c, const int lane) {
+  vint16m1_t b_high = __riscv_vslidedown_vx_i16m1(b, 4, 4);
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
+  vint32m1_t bc_mul = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vwmul_vv_i32m2(b_high, c_dup, 4));
+  vint32m1_t bc_mulx2 = __riscv_vsadd_vv_i32m1(bc_mul, bc_mul, 4);
+  return __riscv_vssub_vv_i32m1(a, bc_mulx2, 4);
+}
 
-// FORCE_INLINE int64x2_t vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t c, const int lane);
+FORCE_INLINE int64x2_t vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t c, const int lane) {
+  vint32m1_t b_high = __riscv_vslidedown_vx_i32m1(b, 2, 2);
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
+  vint64m1_t bc_mul = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vwmul_vv_i64m2(b_high, c_dup, 2));
+  vint64m1_t bc_mulx2 = __riscv_vsadd_vv_i64m1(bc_mul, bc_mul, 2);
+  return __riscv_vssub_vv_i64m1(a, bc_mulx2, 2);
+}
 
 FORCE_INLINE int32x4_t vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t c, const int lane) {
   vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
@@ -9478,9 +9490,21 @@ FORCE_INLINE int64_t vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c, const
   return sat_sub_int64(a, dmull);
 }
 
-// FORCE_INLINE int32x4_t vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t c, const int lane);
+FORCE_INLINE int32x4_t vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t c, const int lane) {
+  vint16m1_t b_high = __riscv_vslidedown_vx_i16m1(b, 4, 8);
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
+  vint32m1_t bc_mul = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vwmul_vv_i32m2(b_high, c_dup, 4));
+  vint32m1_t bc_mulx2 = __riscv_vsadd_vv_i32m1(bc_mul, bc_mul, 4);
+  return __riscv_vssub_vv_i32m1(a, bc_mulx2, 4);
+}
 
-// FORCE_INLINE int64x2_t vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t c, const int lane);
+FORCE_INLINE int64x2_t vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t c, const int lane) {
+  vint32m1_t b_high = __riscv_vslidedown_vx_i32m1(b, 2, 4);
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
+  vint64m1_t bc_mul = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vwmul_vv_i64m2(b_high, c_dup, 2));
+  vint64m1_t bc_mulx2 = __riscv_vsadd_vv_i64m1(bc_mul, bc_mul, 2);
+  return __riscv_vssub_vv_i64m1(a, bc_mulx2, 2);
+}
 
 FORCE_INLINE int32x4_t vmull_lane_s16(int16x4_t a, int16x4_t b, const int c) {
   vint16m1_t b_dup = __riscv_vrgather_vx_i16m1(b, c, 4);
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 1ff92cd5..42f647a7 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -34258,9 +34258,59 @@ result_t test_vqdmlsls_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif // ENABLE_TEST_ALL
 }
 
-result_t test_vqdmlsl_high_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vqdmlsl_high_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { +#ifdef ENABLE_TEST_ALL + const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2; + const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3; + int32x4_t a = vld1q_s32(_a); + int16x8_t b = vld1q_s16(_b); + int16x4_t c = vld1_s16(_c); + int32x4_t d; + int32_t _d[4]; + +#define TEST_IMPL(IDX) \ + for (int i = 0; i < 4; i++) { \ + _d[i] = sat_sub(_a[i], sat_dmull(_b[i + 4], _c[IDX])); \ + } \ + d = vqdmlsl_high_lane_s16(a, b, c, IDX); \ + CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3])) + + IMM_4_ITER +#undef TEST_IMPL + + return TEST_SUCCESS; +#else + return TEST_UNIMPL; +#endif // ENABLE_TEST_ALL +} -result_t test_vqdmlsl_high_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; } +result_t test_vqdmlsl_high_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { +#ifdef ENABLE_TEST_ALL + const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1; + const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2; + const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3; + int64x2_t a = vld1q_s64(_a); + int32x4_t b = vld1q_s32(_b); + int32x2_t c = vld1_s32(_c); + int64x2_t d; + int64_t _d[2]; + +#define TEST_IMPL(IDX) \ + for (int i = 0; i < 2; i++) { \ + _d[i] = sat_sub(_a[i], sat_dmull(_b[i + 2], _c[IDX])); \ + } \ + d = vqdmlsl_high_lane_s32(a, b, c, IDX); \ + CHECK_RESULT(validate_int64(d, _d[0], _d[1])) + + IMM_2_ITER +#undef TEST_IMPL + + return TEST_SUCCESS; +#else + return TEST_UNIMPL; +#endif // ENABLE_TEST_ALL +} result_t test_vqdmlsl_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { #ifdef ENABLE_TEST_ALL @@ -34360,9 +34410,59 @@ result_t test_vqdmlsls_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) #endif // ENABLE_TEST_ALL } -result_t test_vqdmlsl_high_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; } +result_t test_vqdmlsl_high_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { +#ifdef ENABLE_TEST_ALL + const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1; + const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2; + const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3; + int32x4_t a = vld1q_s32(_a); + int16x8_t b = vld1q_s16(_b); + int16x8_t c = vld1q_s16(_c); + int32x4_t d; + int32_t _d[4]; + +#define TEST_IMPL(IDX) \ + for (int i = 0; i < 4; i++) { \ + _d[i] = sat_sub(_a[i], sat_dmull(_b[i + 4], _c[IDX])); \ + } \ + d = vqdmlsl_high_laneq_s16(a, b, c, IDX); \ + CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3])) + + IMM_8_ITER +#undef TEST_IMPL + + return TEST_SUCCESS; +#else + return TEST_UNIMPL; +#endif // ENABLE_TEST_ALL +} -result_t test_vqdmlsl_high_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; } +result_t test_vqdmlsl_high_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { +#ifdef ENABLE_TEST_ALL + const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1; + const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2; + const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3; + int64x2_t a = vld1q_s64(_a); + int32x4_t b = vld1q_s32(_b); + int32x4_t c = vld1q_s32(_c); + int64x2_t d; + int64_t _d[2]; + +#define TEST_IMPL(IDX) \ + for (int i = 0; i < 2; i++) { \ + _d[i] = sat_sub(_a[i], sat_dmull(_b[i + 2], _c[IDX])); \ + } \ + d = vqdmlsl_high_laneq_s32(a, b, c, IDX); \ + CHECK_RESULT(validate_int64(d, _d[0], _d[1])) + + IMM_4_ITER +#undef TEST_IMPL + 
+ return TEST_SUCCESS; +#else + return TEST_UNIMPL; +#endif // ENABLE_TEST_ALL +} result_t test_vmull_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { #ifdef ENABLE_TEST_ALL diff --git a/tests/impl.h b/tests/impl.h index 8a64259b..b10d4a90 100644 --- a/tests/impl.h +++ b/tests/impl.h @@ -1998,14 +1998,14 @@ _(vqdmlsl_lane_s32) \ _(vqdmlslh_lane_s16) \ _(vqdmlsls_lane_s32) \ - /*_(vqdmlsl_high_lane_s16) */ \ - /*_(vqdmlsl_high_lane_s32) */ \ + _(vqdmlsl_high_lane_s16) \ + _(vqdmlsl_high_lane_s32) \ _(vqdmlsl_laneq_s16) \ _(vqdmlsl_laneq_s32) \ _(vqdmlslh_laneq_s16) \ _(vqdmlsls_laneq_s32) \ - /*_(vqdmlsl_high_laneq_s16) */ \ - /*_(vqdmlsl_high_laneq_s32) */ \ + _(vqdmlsl_high_laneq_s16) \ + _(vqdmlsl_high_laneq_s32) \ _(vmull_lane_s16) \ _(vmull_lane_s32) \ _(vmull_lane_u16) \
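
For reference: the new tests validate each result lane against
sat_sub(_a[i], sat_dmull(_b[i + 4], _c[lane])) (or _b[i + 2] for the s32
variants), i.e. a saturating doubling multiply followed by a saturating
subtract, applied to the high half of b. A minimal scalar model of that
semantics is sketched below. The names sat_dmull_s16, sat_sub_s32, and
ref_vqdmlsl_high_lane_s16 are hypothetical stand-ins mirroring the test
suite's sat_dmull/sat_sub helpers, whose definitions are not shown in this
patch, so treat this as an illustration rather than the repository's code.

#include <stdint.h>

/* Saturating doubling multiply: sat32(2 * b * c). For int16 inputs the only
 * overflow case is INT16_MIN * INT16_MIN, whose doubled product is 2^31. */
static int32_t sat_dmull_s16(int16_t b, int16_t c) {
  int64_t p = 2 * (int64_t)b * (int64_t)c;
  if (p > INT32_MAX) return INT32_MAX;
  if (p < INT32_MIN) return INT32_MIN;
  return (int32_t)p;
}

/* Saturating subtract in the int32 accumulator domain. */
static int32_t sat_sub_s32(int32_t a, int32_t b) {
  int64_t r = (int64_t)a - (int64_t)b;
  if (r > INT32_MAX) return INT32_MAX;
  if (r < INT32_MIN) return INT32_MIN;
  return (int32_t)r;
}

/* Scalar model of vqdmlsl_high_lane_s16: take the high half of b (elements
 * 4..7), form the saturating doubling product with c[lane], and
 * saturating-subtract it from the accumulator a, lane by lane. */
static void ref_vqdmlsl_high_lane_s16(const int32_t a[4], const int16_t b[8],
                                      const int16_t c[4], int lane,
                                      int32_t d[4]) {
  for (int i = 0; i < 4; i++) {
    d[i] = sat_sub_s32(a[i], sat_dmull_s16(b[i + 4], c[lane]));
  }
}

These two saturation points are why the RVV implementations above double the
widened product with a saturating add (vsadd of the product with itself) and
accumulate with a saturating subtract (vssub): a plain shift-and-subtract
would wrap on INT16_MIN * INT16_MIN and on accumulator underflow, which the
sat_sub/sat_dmull reference in the tests would flag as a mismatch.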