VectorCamp · markos · Dec 14, 2023 · Dec 18, 2023 · Dec 18, 2023 · Dec 18, 2023
diff --git a/src/hwlm/noodle_engine_simd.hpp b/src/hwlm/noodle_engine_simd.hpp
diff --git a/src/util/arch/arm/match.hpp b/src/util/arch/arm/match.hpp
@@ -34,7 +34,6 @@ const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 cons
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
         typename SuperVector<16>::comparemask_type z = mask.comparemask();
-        DEBUG_PRINTF("z %08llx\n", z);
         DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
         u32 pos = ctz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
@@ -54,7 +53,6 @@ const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const
     if (vmax != 0) {
         typename SuperVector<16>::comparemask_type z = mask.comparemask();
         DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-        DEBUG_PRINTF("z %08llx\n", z);
         u32 pos = clz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
         return buf + (15 - pos);
@@ -70,7 +68,6 @@ const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
     uint64_t vmax = vgetq_lane_u64 (vreinterpretq_u64_u32 (vpmaxq_u32(m, m)), 0);
     if (vmax != 0) {
         typename SuperVector<16>::comparemask_type z = mask.comparemask();
-        DEBUG_PRINTF("z %08llx\n", z);
         DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
         u32 pos = ctz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
@@ -90,7 +87,6 @@ const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16
     if (vmax != 0) {
         typename SuperVector<16>::comparemask_type z = mask.comparemask();
         DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-        DEBUG_PRINTF("z %08llx\n", z);
         u32 pos = clz64(z) / SuperVector<16>::mask_width();
         DEBUG_PRINTF("match @ pos %u\n", pos);
         return buf + (15 - pos);

diff --git a/src/util/arch/ppc64el/match.hpp b/src/util/arch/ppc64el/match.hpp
@@ -31,11 +31,10 @@ template <>
 really_really_inline
 const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
     if (unlikely(z)) {
         u32 pos = ctz32(z);
-        DEBUG_PRINTF("~z %08llx\n", ~z);
+        DEBUG_PRINTF("~z %08x\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         return buf + pos;
@@ -48,8 +47,7 @@ template <>
 really_really_inline
 const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
     if (unlikely(z)) {
         u32 pos = clz32(z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
@@ -64,11 +62,10 @@ template <>
 really_really_inline
 const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
     if (unlikely(z != 0xffff)) {
         u32 pos = ctz32(~z & 0xffff);
-        DEBUG_PRINTF("~z %08llx\n", ~z);
+        DEBUG_PRINTF("~z %08x\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         return buf + pos;
@@ -82,11 +79,10 @@ template <>
 really_really_inline
 const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
     if (unlikely(z != 0xffff)) {
         u32 pos = clz32(~z & 0xffff);
-        DEBUG_PRINTF("~z %08llx\n", ~z);
+        DEBUG_PRINTF("~z %08x\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos >= 16 && pos < 32);
         return buf + (31 - pos);

diff --git a/src/util/arch/x86/match.hpp b/src/util/arch/x86/match.hpp
@@ -32,11 +32,10 @@ really_really_inline
 const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
     assert(SuperVector<16>::mask_width() == 1);
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x\n", buf, z);
     if (unlikely(z)) {
         u32 pos = ctz32(z);
-        DEBUG_PRINTF("~z %08llx\n", ~z);
+        DEBUG_PRINTF("~z %08x\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         return buf + pos;
@@ -85,8 +84,7 @@ really_really_inline
 const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
     assert(SuperVector<16>::mask_width() == 1);
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
     if (unlikely(z)) {
         u32 pos = clz32(z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
@@ -137,11 +135,10 @@ really_really_inline
 const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, u16 const UNUSED len) {
     assert(SuperVector<16>::mask_width() == 1);
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
     if (unlikely(z != 0xffff)) {
         u32 pos = ctz32(~z & 0xffff);
-        DEBUG_PRINTF("~z %08llx\n", ~z);
+        DEBUG_PRINTF("~z %08x\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos < 16);
         return buf + pos;
@@ -174,7 +171,7 @@ const u8 *first_zero_match_inverted<64>(const u8 *buf, SuperVector<64>v, u16 con
     u64a mask = (~0ULL) >> (64 - len);
     DEBUG_PRINTF("mask %016llx\n", mask);
     z = ~z & mask;
-    DEBUG_PRINTF("z 0x%016llx\n", z);
+    DEBUG_PRINTF("z 0x%016llx\n", (u64a) z);
     if (unlikely(z)) {
         u32 pos = ctz64(z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
@@ -190,11 +187,10 @@ really_really_inline
 const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> v, uint16_t UNUSED len ) {
     assert(SuperVector<16>::mask_width() == 1);
     SuperVector<16>::comparemask_type z = v.comparemask();
-    DEBUG_PRINTF("buf %p z %08llx \n", buf, z);
-    DEBUG_PRINTF("z %08llx\n", z);
+    DEBUG_PRINTF("buf %p z %08x \n", buf, z);
     if (unlikely(z != 0xffff)) {
         u32 pos = clz32(~z & 0xffffu);
-        DEBUG_PRINTF("~z %08llx\n", ~z);
+        DEBUG_PRINTF("~z %08x\n", ~z);
         DEBUG_PRINTF("match @ pos %u\n", pos);
         assert(pos >= 16 && pos < 32);
         return buf + (31 - pos);

diff --git a/src/util/arch/x86/x86.h b/src/util/arch/x86/x86.h
@@ -61,6 +61,7 @@
 
 #if defined(__AVX512BW__) && defined(BUILD_AVX512)
 #define HAVE_AVX512
+#define HAVE_MASKED_LOADS
 #define HAVE_SIMD_512_BITS
 #endif
 

diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp
@@ -525,11 +525,26 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
 {
-    SuperVector mask = Ones_vshr(16 -len);
-    SuperVector<16> v = loadu(ptr);
+    SuperVector mask = Ones_vshr(16 - len);
+    SuperVector v = loadu(ptr);
     return mask & v;
 }
 
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
+{
+    DEBUG_PRINTF("mask = %08llx\n", mask);
+    SuperVector v = loadu(ptr);
+    (void)mask;
+    return v; // FIXME: & mask
+}
+
+template<>
+really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z)
+{
+  return findAndClearLSB_64(&z) >> 2;
+}
+
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {

diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp
@@ -555,6 +555,12 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint
     return mask & v;
 }
 
+template<>
+really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z)
+{
+  return findAndClearLSB_32(&z);
+}
+
 template<>
 really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
 {   

diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
@@ -524,7 +524,28 @@ really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint
 {
     SuperVector mask = Ones_vshr(16 -len);
     SuperVector v = _mm_loadu_si128((const m128 *)ptr);
-    return mask & v;
+    return v & mask;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
+{
+#ifdef HAVE_AVX512
+    SuperVector<16> v = _mm_maskz_loadu_epi8(mask, (const m128 *)ptr);
+    v.print8("v");
+    return v;
+#else
+    DEBUG_PRINTF("mask = %08x\n", mask);
+    SuperVector v = _mm_loadu_si128((const m128 *)ptr);
+    (void)mask;
+    return v; // FIXME: & mask
+#endif
+}
+
+template<>
+really_inline typename SuperVector<16>::comparemask_type SuperVector<16>::findLSB(typename SuperVector<16>::comparemask_type &z)
+{
+  return findAndClearLSB_32(&z);
 }
 
 template<>
@@ -1126,22 +1147,35 @@ really_inline SuperVector<32> SuperVector<32>::load(void const *ptr)
 template <>
 really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint8_t const len)
 {
+    SuperVector mask = Ones_vshr(32 -len);
+    mask.print8("mask");
+    SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr);
+    v.print8("v");
+    return v & mask;
+}
+
+template <>
+really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
+{
+    DEBUG_PRINTF("mask = %08llx\n", mask);
 #ifdef HAVE_AVX512
-    u32 mask = (~0ULL) >> (32 - len);
-    SuperVector<32> v = _mm256_mask_loadu_epi8(Zeroes().u.v256[0], mask, (const m256 *)ptr);
+    SuperVector<32> v = _mm256_maskz_loadu_epi8(mask, (const m256 *)ptr);
     v.print8("v");
     return v;
 #else
-    DEBUG_PRINTF("len = %d", len);
-    SuperVector<32> mask = Ones_vshr(32 -len);
-    mask.print8("mask");
-    (Ones() >> (32 - len)).print8("mask");
     SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr);
     v.print8("v");
-    return mask & v;
+    (void)mask;
+    return v; // FIXME: & mask
 #endif
 }
 
+template<>
+really_inline typename SuperVector<32>::comparemask_type SuperVector<32>::findLSB(typename SuperVector<32>::comparemask_type &z)
+{
+  return findAndClearLSB_64(&z);
+}
+
 template<>
 really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
 {
@@ -1778,11 +1812,26 @@ really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, uint
 {
     u64a mask = (~0ULL) >> (64 - len);
     DEBUG_PRINTF("mask = %016llx\n", mask);
-    SuperVector<64> v = _mm512_mask_loadu_epi8(Zeroes().u.v512[0], mask, (const m512 *)ptr);
+    SuperVector<64> v = _mm512_maskz_loadu_epi8(mask, (const m512 *)ptr);
     v.print8("v");
     return v;
 }
 
+template <>
+really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, typename base_type::comparemask_type const mask)
+{
+    DEBUG_PRINTF("mask = %016llx\n", mask);
+    SuperVector<64> v = _mm512_maskz_loadu_epi8(mask, (const m512 *)ptr);
+    v.print8("v");
+    return v;
+}
+
+template<>
+really_inline typename SuperVector<64>::comparemask_type SuperVector<64>::findLSB(typename SuperVector<64>::comparemask_type &z)
+{
+  return findAndClearLSB_64(&z);
+}
+
 template<>
 template<>
 really_inline SuperVector<64> SuperVector<64>::pshufb<true>(SuperVector<64> b)

diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
@@ -46,35 +46,7 @@
 #endif
 #endif // VS_SIMDE_BACKEND
 
-#if defined(HAVE_SIMD_512_BITS)
-using Z_TYPE = u64a;
-#define Z_BITS 64
-#define Z_SHIFT 63
-#define Z_POSSHIFT 0
-#define DOUBLE_LOAD_MASK(l)        ((~0ULL) >> (Z_BITS -(l)))
-#define SINGLE_LOAD_MASK(l)        (((1ULL) << (l)) - 1ULL)
-#elif defined(HAVE_SIMD_256_BITS)
-using Z_TYPE = u32;
-#define Z_BITS 32
-#define Z_SHIFT 31
-#define Z_POSSHIFT 0
-#define DOUBLE_LOAD_MASK(l)        (((1ULL) << (l)) - 1ULL)
-#define SINGLE_LOAD_MASK(l)        (((1ULL) << (l)) - 1ULL)
-#elif defined(HAVE_SIMD_128_BITS)
-#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
-using Z_TYPE = u64a;
-#define Z_BITS 64
-#define Z_POSSHIFT 2
-#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l)))
-#else
-using Z_TYPE = u32;
-#define Z_BITS 32
-#define Z_POSSHIFT 0
-#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
-#endif
-#define Z_SHIFT 15
-#define SINGLE_LOAD_MASK(l)        (((1ULL) << (l)) - 1ULL)
-#endif
+#include <util/bitutils.h>
 
 // Define a common assume_aligned using an appropriate compiler built-in, if
 // it's available. Note that we need to handle C or C++ compilation.
@@ -138,7 +110,7 @@ struct BaseVector<64>
   static constexpr u16  previous_size = 32;
 };
 
-// 128 bit implementation
+// 256 bit implementation
 template <>
 struct BaseVector<32>
 {
@@ -158,7 +130,11 @@ struct BaseVector<16>
   static constexpr bool      is_valid = true;
   static constexpr u16           size = 16;
   using                          type = m128;
+#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
   using              comparemask_type = u64a;
+#else
+  using              comparemask_type = u32;
+#endif
   static constexpr bool  has_previous = false;
   using                 previous_type = u64a;
   static constexpr u16  previous_size = 8;
@@ -257,9 +233,12 @@ class SuperVector : public BaseVector<SIZE>
   static typename base_type::comparemask_type
   iteration_mask(typename base_type::comparemask_type mask);
 
+  static typename base_type::comparemask_type load_mask(uint8_t const len) { return (((1ULL) << (len)) - 1ULL); }
+  static typename base_type::comparemask_type findLSB(typename base_type::comparemask_type &z);
   static SuperVector loadu(void const *ptr);
   static SuperVector load(void const *ptr);
   static SuperVector loadu_maskz(void const *ptr, uint8_t const len);
+  static SuperVector loadu_maskz(void const *ptr, typename base_type::comparemask_type const len);
   SuperVector alignr(SuperVector &other, int8_t offset);
 
   template<bool emulateIntel=true>