Skip to content

Commit

Permalink
Merge pull request #481 from howjmay/vqdmull_laneq
Browse files Browse the repository at this point in the history
feat: Add vqdmull_laneq_[s16|s32]
  • Loading branch information
howjmay authored Jul 30, 2024
2 parents f8d60e7 + f2d5504 commit d1f1aed
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 10 deletions.
20 changes: 14 additions & 6 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9594,14 +9594,14 @@ FORCE_INLINE uint64x2_t vmull_high_laneq_u32(uint32x4_t a, uint32x4_t b, const i
return __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vwmulu_vv_u64m2(a_high, b_dup, 2));
}

FORCE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t a, int16x4_t b, const int c) {
vint16m1_t b_dup_lane = __riscv_vrgather_vx_i16m1(b, c, 4);
// Signed saturating doubling multiply long by lane:
//   result[i] = saturate_int32(2 * a[i] * b[lane]), for i = 0..3.
FORCE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t a, int16x4_t b, const int lane) {
  // Broadcast b[lane] to every element.
  vint16m1_t b_dup_lane = __riscv_vrgather_vx_i16m1(b, lane, 4);
  // The widening product always fits in 32 bits (|a * b| <= 2^30).
  vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(a, b_dup_lane, 4);
  // Double with a saturating add: a plain shift left by 1 would wrap
  // INT16_MIN * INT16_MIN * 2 (= 2^31) to INT32_MIN instead of clamping to INT32_MAX.
  return __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsadd_vv_i32m2(ab_mul, ab_mul, 4));
}

FORCE_INLINE int64x2_t vqdmull_lane_s32(int32x2_t a, int32x2_t b, const int c) {
vint32m1_t b_dup_lane = __riscv_vrgather_vx_i32m1(b, c, 2);
// Signed saturating doubling multiply long by lane:
//   result[i] = saturate_int64(2 * a[i] * b[lane]), for i = 0..1.
FORCE_INLINE int64x2_t vqdmull_lane_s32(int32x2_t a, int32x2_t b, const int lane) {
  // Broadcast b[lane] to every element.
  vint32m1_t b_dup_lane = __riscv_vrgather_vx_i32m1(b, lane, 2);
  // The widening product always fits in 64 bits (|a * b| <= 2^62).
  vint64m2_t ab_mul = __riscv_vwmul_vv_i64m2(a, b_dup_lane, 2);
  // Double with a saturating add: a plain shift left by 1 would wrap
  // INT32_MIN * INT32_MIN * 2 (= 2^63) to INT64_MIN instead of clamping to INT64_MAX.
  return __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsadd_vv_i64m2(ab_mul, ab_mul, 2));
}
Expand Down Expand Up @@ -9632,9 +9632,17 @@ FORCE_INLINE int64x2_t vqdmull_high_lane_s32(int32x4_t a, int32x2_t b, const int
return __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsll_vx_i64m2(ab_mul, 1, 2));
}

// FORCE_INLINE int32x4_t vqdmull_laneq_s16(int16x4_t a, int16x8_t b, const int lane);
// Signed saturating doubling multiply long by a lane of a 128-bit vector:
//   result[i] = saturate_int32(2 * a[i] * b[lane]), for i = 0..3,
// where `lane` indexes the 8 elements of b.
FORCE_INLINE int32x4_t vqdmull_laneq_s16(int16x4_t a, int16x8_t b, const int lane) {
  // Broadcast b[lane] to every element.
  vint16m1_t b_dup_lane = __riscv_vrgather_vx_i16m1(b, lane, 8);
  // Only the 4 elements of a take part; the widening product always fits
  // in 32 bits (|a * b| <= 2^30).
  vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(a, b_dup_lane, 4);
  // Double with a saturating add: a plain shift left by 1 would wrap
  // INT16_MIN * INT16_MIN * 2 (= 2^31) to INT32_MIN instead of clamping to INT32_MAX.
  return __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsadd_vv_i32m2(ab_mul, ab_mul, 4));
}

// FORCE_INLINE int64x2_t vqdmull_laneq_s32(int32x2_t a, int32x4_t b, const int lane);
// Signed saturating doubling multiply long by a lane of a 128-bit vector:
//   result[i] = saturate_int64(2 * a[i] * b[lane]), for i = 0..1,
// where `lane` indexes the 4 elements of b.
FORCE_INLINE int64x2_t vqdmull_laneq_s32(int32x2_t a, int32x4_t b, const int lane) {
  // Broadcast b[lane] to every element.
  vint32m1_t b_dup_lane = __riscv_vrgather_vx_i32m1(b, lane, 4);
  // Only the 2 elements of a take part; the widening product always fits
  // in 64 bits (|a * b| <= 2^62).
  vint64m2_t ab_mul = __riscv_vwmul_vv_i64m2(a, b_dup_lane, 2);
  // Double with a saturating add: a plain shift left by 1 would wrap
  // INT32_MIN * INT32_MIN * 2 (= 2^63) to INT64_MIN instead of clamping to INT64_MAX.
  return __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsadd_vv_i64m2(ab_mul, ab_mul, 2));
}

FORCE_INLINE int32_t vqdmullh_laneq_s16(int16_t a, int16x8_t b, const int lane) {
int16_t b_lane = vgetq_lane_s16(b, lane);
Expand Down
50 changes: 48 additions & 2 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35006,9 +35006,55 @@ result_t test_vqdmull_high_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t ite
#endif // ENABLE_TEST_ALL
}

result_t test_vqdmull_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
// Compares vqdmull_laneq_s16 with the scalar reference sat_dmull for each
// lane index instantiated by IMM_8_ITER. Reports TEST_UNIMPL when
// ENABLE_TEST_ALL is not defined.
result_t test_vqdmull_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifndef ENABLE_TEST_ALL
  return TEST_UNIMPL;
#else
  const int16_t *src_a = (int16_t *)impl.test_cases_int_pointer1;
  const int16_t *src_b = (int16_t *)impl.test_cases_int_pointer2;
  int16x4_t vec_a = vld1_s16(src_a);
  int16x8_t vec_b = vld1q_s16(src_b);
  int32_t expected[4];
  int32x4_t actual;

// One instantiation per lane: build the scalar reference from element IDX of
// the second operand, then compare it with the intrinsic's result.
#define TEST_IMPL(IDX)                                  \
  for (int k = 0; k != 4; ++k) {                        \
    expected[k] = sat_dmull(src_a[k], src_b[IDX]);      \
  }                                                     \
  actual = vqdmull_laneq_s16(vec_a, vec_b, IDX);        \
  CHECK_RESULT(validate_int32(actual, expected[0], expected[1], expected[2], expected[3]))

  // NOTE(review): IMM_8_ITER is defined elsewhere; presumably it expands
  // TEST_IMPL for lane indices 0..7 -- confirm.
  IMM_8_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#endif  // ENABLE_TEST_ALL
}

result_t test_vqdmull_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
// Compares vqdmull_laneq_s32 with the scalar reference sat_dmull for each
// lane index instantiated by IMM_4_ITER. Reports TEST_UNIMPL when
// ENABLE_TEST_ALL is not defined.
result_t test_vqdmull_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
#ifndef ENABLE_TEST_ALL
  return TEST_UNIMPL;
#else
  const int32_t *src_a = (int32_t *)impl.test_cases_int_pointer1;
  const int32_t *src_b = (int32_t *)impl.test_cases_int_pointer2;
  int32x2_t vec_a = vld1_s32(src_a);
  int32x4_t vec_b = vld1q_s32(src_b);
  int64_t expected[2];
  int64x2_t actual;

// One instantiation per lane: build the scalar reference from element IDX of
// the second operand, then compare it with the intrinsic's result.
#define TEST_IMPL(IDX)                                  \
  for (int k = 0; k != 2; ++k) {                        \
    expected[k] = sat_dmull(src_a[k], src_b[IDX]);      \
  }                                                     \
  actual = vqdmull_laneq_s32(vec_a, vec_b, IDX);        \
  CHECK_RESULT(validate_int64(actual, expected[0], expected[1]))

  // NOTE(review): IMM_4_ITER is defined elsewhere; presumably it expands
  // TEST_IMPL for lane indices 0..3 -- confirm.
  IMM_4_ITER
#undef TEST_IMPL

  return TEST_SUCCESS;
#endif  // ENABLE_TEST_ALL
}

// Placeholder: no test body for vqdmullh_laneq_s16 yet; always reports TEST_UNIMPL.
result_t test_vqdmullh_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }

Expand Down
4 changes: 2 additions & 2 deletions tests/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2028,8 +2028,8 @@
_(vqdmulls_lane_s32) \
_(vqdmull_high_lane_s16) \
_(vqdmull_high_lane_s32) \
/*_(vqdmull_laneq_s16) */ \
/*_(vqdmull_laneq_s32) */ \
_(vqdmull_laneq_s16) \
_(vqdmull_laneq_s32) \
_(vqdmullh_laneq_s16) \
_(vqdmulls_laneq_s32) \
_(vqdmull_high_laneq_s16) \
Expand Down

0 comments on commit d1f1aed

Please sign in to comment.