From 1931a78291b0e3aa28887ba17d74fc3c9914215d Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:05:20 -0700 Subject: [PATCH] clarify bitset_extract_setbits scalar code (BLSI -> BLSR) (#659) The "BLSI" logic is essentially a complicated way to get to BLSR, so refactor to be more explicit. --- src/bitset_util.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/src/bitset_util.c b/src/bitset_util.c index 0ae7d9258..6bc12b44b 100644 --- a/src/bitset_util.c +++ b/src/bitset_util.c @@ -613,16 +613,13 @@ size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length, for (; (i < length) && (out < safeout); ++i) { uint64_t w = words[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out, &val, sizeof(uint32_t)); // should be compiled as a MOV on x64 out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -667,15 +664,12 @@ size_t bitset_extract_setbits_avx512_uint16(const uint64_t *array, for (; (i < length) && (out < safeout); ++i) { uint64_t w = array[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out, &val, sizeof(uint16_t)); out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -725,16 +719,13 @@ size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length, for (; (i < length) && (out < safeout); ++i) { uint64_t w = words[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out, &val, sizeof(uint32_t)); // should be compiled as a MOV on x64 out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -749,16 +740,13 @@ size_t bitset_extract_setbits(const uint64_t *words, size_t length, for (size_t i = 0; i < length; ++i) { uint64_t w = words[i]; while (w != 0) { - uint64_t t = - w & (~w + 1); // on x64, should compile to BLSI (careful: the - // Intel compiler seems to fail) int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT uint32_t val = r + base; memcpy(out + outpos, &val, sizeof(uint32_t)); // should be compiled as a MOV on x64 outpos++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -772,10 +760,9 @@ size_t bitset_extract_intersection_setbits_uint16( for (size_t i = 0; i < length; ++i) { uint64_t w = words1[i] & words2[i]; while (w != 0) { - uint64_t t = w & (~w + 1); int r = roaring_trailing_zeroes(w); out[outpos++] = (uint16_t)(r + base); - w ^= t; + w &= (w - 1); } base += 64; } @@ -836,11 +823,10 @@ size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length, for (; (i < length) && (out < safeout); ++i) { uint64_t w = words[i]; while ((w != 0) && (out < safeout)) { - uint64_t t = w & (~w + 1); int r = roaring_trailing_zeroes(w); *out = (uint16_t)(r + base); out++; - w ^= t; + w &= (w - 1); } base += 64; } @@ -864,10 +850,9 @@ size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length, for (size_t i = 0; i < length; ++i) { uint64_t w = words[i]; while (w != 0) { - uint64_t t = w & (~w + 1); int r = roaring_trailing_zeroes(w); out[outpos++] = (uint16_t)(r + base); - w ^= t; + w &= (w - 1); } base += 64; } @@ -1158,4 +1143,4 @@ void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length) { #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop -#endif \ No newline at end of file +#endif