Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/refactor noodle masked load (WIP) #216

Draft
wants to merge 10 commits into
base: develop
Choose a base branch
from
311 changes: 131 additions & 180 deletions src/hwlm/noodle_engine_simd.hpp

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions src/util/arch/arm/match.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
Expand All @@ -54,7 +53,6 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + (15 - pos);
Expand All @@ -70,7 +68,6 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
u32 pos = ctz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
Expand All @@ -90,7 +87,6 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
if (vmax != 0) {
typename SuperVector<16>::comparemask_type z = mask.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
u32 pos = clz64(z) / SuperVector<16>::mask_width();
DEBUG_PRINTF("match @ pos %u\n", pos);
return buf + (15 - pos);
Expand Down
18 changes: 7 additions & 11 deletions src/util/arch/ppc64el/match.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,10 @@ template <>
really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
if (unlikely(z)) {
u32 pos = ctz32(z);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
Expand All @@ -48,8 +47,7 @@ template <>
really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
if (unlikely(z)) {
u32 pos = clz32(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
Expand All @@ -64,11 +62,10 @@ template <>
really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
if (unlikely(z != 0xffff)) {
u32 pos = ctz32(~z & 0xffff);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
Expand All @@ -82,11 +79,10 @@ template <>
really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
if (unlikely(z != 0xffff)) {
u32 pos = clz32(~z & 0xffff);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
Expand Down
20 changes: 8 additions & 12 deletions src/util/arch/x86/match.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,10 @@ really_really_inline
const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x\n", buf, z);
if (unlikely(z)) {
u32 pos = ctz32(z);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
Expand Down Expand Up @@ -85,8 +84,7 @@ really_really_inline
const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
if (unlikely(z)) {
u32 pos = clz32(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
Expand Down Expand Up @@ -137,11 +135,10 @@ really_really_inline
const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
if (unlikely(z != 0xffff)) {
u32 pos = ctz32(~z & 0xffff);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos < 16);
return buf + pos;
Expand Down Expand Up @@ -174,7 +171,7 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask %016llx\n", mask);
z = ~z & mask;
DEBUG_PRINTF("z 0x%016llx\n", z);
DEBUG_PRINTF("z 0x%016llx\n", (u64a) z);
if (unlikely(z)) {
u32 pos = ctz64(z);
DEBUG_PRINTF("match @ pos %u\n", pos);
Expand All @@ -190,11 +187,10 @@ really_really_inline
const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
assert(SuperVector<16>::mask_width() == 1);
SuperVector<16>::comparemask_type z = v.comparemask();
DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
DEBUG_PRINTF("z %08llx\n", z);
DEBUG_PRINTF("buf %p z %08x \n", buf, z);
if (unlikely(z != 0xffff)) {
u32 pos = clz32(~z & 0xffffu);
DEBUG_PRINTF("~z %08llx\n", ~z);
DEBUG_PRINTF("~z %08x\n", ~z);
DEBUG_PRINTF("match @ pos %u\n", pos);
assert(pos >= 16 && pos < 32);
return buf + (31 - pos);
Expand Down
1 change: 1 addition & 0 deletions src/util/arch/x86/x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@

#if defined(__AVX512BW__) && defined(BUILD_AVX512)
#define HAVE_AVX512
#define HAVE_MASKED_LOADS
#define HAVE_SIMD_512_BITS
#endif

Expand Down
19 changes: 17 additions & 2 deletions src/util/supervector/arch/arm/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -525,11 +525,26 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
template <>
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
{
SuperVector mask = Ones_vshr(16 -len);
SuperVector<16> v = loadu(ptr);
SuperVector mask = Ones_vshr(16 - len);
SuperVector v = loadu(ptr);
return mask & v;
}

template <>
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
{
DEBUG_PRINTF("mask = %08llx\n", mask);
SuperVector v = loadu(ptr);
(void)mask;
return v; // FIXME: & mask

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FIXME

}

template<>
really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z)
{
return findAndClearLSB_64(&z) >> 2;
}

template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
{
Expand Down
6 changes: 6 additions & 0 deletions src/util/supervector/arch/ppc64el/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,12 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint
return mask & v;
}

template<>
really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z)
{
return findAndClearLSB_32(&z);
}

template<>
really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
{
Expand Down
67 changes: 58 additions & 9 deletions src/util/supervector/arch/x86/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,28 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint
{
SuperVector mask = Ones_vshr(16 -len);
SuperVector v = _mm_loadu_si128((const m128 *)ptr);
return mask & v;
return v & mask;
}

template <>
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
{
#ifdef HAVE_AVX512
SuperVector<16> v = _mm_maskz_loadu_epi8(mask, (const m128 *)ptr);
v.print8("v");
return v;
#else
DEBUG_PRINTF("mask = %08x\n", mask);
SuperVector v = _mm_loadu_si128((const m128 *)ptr);
(void)mask;
return v; // FIXME: & mask
markos marked this conversation as resolved.
Show resolved Hide resolved
#endif
}

template<>
really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z)
{
return findAndClearLSB_32(&z);
}

template<>
Expand Down Expand Up @@ -1126,22 +1147,35 @@ really_inline SuperVector<32> SuperVector<32>::load(void const *ptr)
template <>
really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint8_t const len)
{
SuperVector mask = Ones_vshr(32 -len);
mask.print8("mask");
SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr);
v.print8("v");
return v & mask;
}

template <>
really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
{
DEBUG_PRINTF("mask = %08llx\n", mask);
#ifdef HAVE_AVX512
u32 mask = (~0ULL) >> (32 - len);
SuperVector<32> v = _mm256_mask_loadu_epi8(Zeroes().u.v256[0], mask, (const m256 *)ptr);
SuperVector<32> v = _mm256_maskz_loadu_epi8(mask, (const m256 *)ptr);
v.print8("v");
return v;
#else
DEBUG_PRINTF("len = %d", len);
SuperVector<32> mask = Ones_vshr(32 -len);
mask.print8("mask");
(Ones() >> (32 - len)).print8("mask");
SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr);
v.print8("v");
return mask & v;
(void)mask;
return v; // FIXME: & mask
markos marked this conversation as resolved.
Show resolved Hide resolved
#endif
}

template<>
really_inline typename SuperVector<32>::comparemask_type SuperVector<32>::findLSB(typename SuperVector<32>::comparemask_type &z)
{
return findAndClearLSB_64(&z);
}

template<>
really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
{
Expand Down Expand Up @@ -1778,11 +1812,26 @@ really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, uint
{
u64a mask = (~0ULL) >> (64 - len);
DEBUG_PRINTF("mask = %016llx\n", mask);
SuperVector<64> v = _mm512_mask_loadu_epi8(Zeroes().u.v512[0], mask, (const m512 *)ptr);
SuperVector<64> v = _mm512_maskz_loadu_epi8(mask, (const m512 *)ptr);
v.print8("v");
return v;
}

template <>
really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
{
DEBUG_PRINTF("mask = %016llx\n", mask);
SuperVector<64> v = _mm512_maskz_loadu_epi8(mask, (const m512 *)ptr);
v.print8("v");
return v;
}

template<>
really_inline typename SuperVector<64>::comparemask_type SuperVector<64>::findLSB(typename SuperVector<64>::comparemask_type &z)
{
return findAndClearLSB_64(&z);
}

template<>
template<>
really_inline SuperVector<64> SuperVector<64>::pshufb<true>(SuperVector<64> b)
Expand Down
39 changes: 9 additions & 30 deletions src/util/supervector/supervector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,35 +46,7 @@
#endif
#endif // VS_SIMDE_BACKEND

#if defined(HAVE_SIMD_512_BITS)
using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_SHIFT 63
#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS -(l)))
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#elif defined(HAVE_SIMD_256_BITS)
using Z_TYPE = u32;
#define Z_BITS 32
#define Z_SHIFT 31
#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#elif defined(HAVE_SIMD_128_BITS)
#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_POSSHIFT 2
#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
#else
using Z_TYPE = u32;
#define Z_BITS 32
#define Z_POSSHIFT 0
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#endif
#define Z_SHIFT 15
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#endif
#include <util/bitutils.h>

// Define a common assume_aligned using an appropriate compiler built-in, if
// it's available. Note that we need to handle C or C++ compilation.
Expand Down Expand Up @@ -138,7 +110,7 @@ struct BaseVector<64>
static constexpr u16 previous_size = 32;
};

// 128 bit implementation
// 256 bit implementation
template <>
struct BaseVector<32>
{
Expand All @@ -158,7 +130,11 @@ struct BaseVector<16>
static constexpr bool is_valid = true;
static constexpr u16 size = 16;
using type = m128;
#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
using comparemask_type = u64a;
#else
using comparemask_type = u32;
#endif
static constexpr bool has_previous = false;
using previous_type = u64a;
static constexpr u16 previous_size = 8;
Expand Down Expand Up @@ -257,9 +233,12 @@ class SuperVector : public BaseVector<SIZE>
static typename base_type::comparemask_type
iteration_mask(typename base_type::comparemask_type mask);

static typename base_type::comparemask_type load_mask(uint8_t const len) { return (((1ULL) << (len)) - 1ULL); }
static typename base_type::comparemask_type findLSB(typename base_type::comparemask_type &z);
static SuperVector loadu(void const *ptr);
static SuperVector load(void const *ptr);
static SuperVector loadu_maskz(void const *ptr, uint8_t const len);
static SuperVector loadu_maskz(void const *ptr, typename base_type::comparemask_type const len);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you add the implementation for arm later on, but I didn't see any implementation for ppc64 ?

SuperVector alignr(SuperVector &other, int8_t offset);

template<bool emulateIntel=true>
Expand Down