Skip to content

Commit

Permalink
clarify bitset_extract_setbits scalar code (BLSI -> BLSR) (#659)
Browse files Browse the repository at this point in the history
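The "BLSI" logic (isolate the lowest set bit with `t = w & (~w + 1)`, then
clear it with `w ^= t`) is essentially a complicated way to get to BLSR
(clear the lowest set bit directly with `w &= (w - 1)`), so refactor to be
more explicit.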
  • Loading branch information
goldsteinn authored Sep 10, 2024
1 parent 955c769 commit 1931a78
Showing 1 changed file with 8 additions and 23 deletions.
31 changes: 8 additions & 23 deletions src/bitset_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -613,16 +613,13 @@ size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length,
for (; (i < length) && (out < safeout); ++i) {
uint64_t w = words[i];
while ((w != 0) && (out < safeout)) {
uint64_t t =
w & (~w + 1); // on x64, should compile to BLSI (careful: the
// Intel compiler seems to fail)
int r =
roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
uint32_t val = r + base;
memcpy(out, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
out++;
w ^= t;
w &= (w - 1);
}
base += 64;
}
Expand Down Expand Up @@ -667,15 +664,12 @@ size_t bitset_extract_setbits_avx512_uint16(const uint64_t *array,
for (; (i < length) && (out < safeout); ++i) {
uint64_t w = array[i];
while ((w != 0) && (out < safeout)) {
uint64_t t =
w & (~w + 1); // on x64, should compile to BLSI (careful: the
// Intel compiler seems to fail)
int r =
roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
uint32_t val = r + base;
memcpy(out, &val, sizeof(uint16_t));
out++;
w ^= t;
w &= (w - 1);
}
base += 64;
}
Expand Down Expand Up @@ -725,16 +719,13 @@ size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
for (; (i < length) && (out < safeout); ++i) {
uint64_t w = words[i];
while ((w != 0) && (out < safeout)) {
uint64_t t =
w & (~w + 1); // on x64, should compile to BLSI (careful: the
// Intel compiler seems to fail)
int r =
roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
uint32_t val = r + base;
memcpy(out, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
out++;
w ^= t;
w &= (w - 1);
}
base += 64;
}
Expand All @@ -749,16 +740,13 @@ size_t bitset_extract_setbits(const uint64_t *words, size_t length,
for (size_t i = 0; i < length; ++i) {
uint64_t w = words[i];
while (w != 0) {
uint64_t t =
w & (~w + 1); // on x64, should compile to BLSI (careful: the
// Intel compiler seems to fail)
int r =
roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
uint32_t val = r + base;
memcpy(out + outpos, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
outpos++;
w ^= t;
w &= (w - 1);
}
base += 64;
}
Expand All @@ -772,10 +760,9 @@ size_t bitset_extract_intersection_setbits_uint16(
for (size_t i = 0; i < length; ++i) {
uint64_t w = words1[i] & words2[i];
while (w != 0) {
uint64_t t = w & (~w + 1);
int r = roaring_trailing_zeroes(w);
out[outpos++] = (uint16_t)(r + base);
w ^= t;
w &= (w - 1);
}
base += 64;
}
Expand Down Expand Up @@ -836,11 +823,10 @@ size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
for (; (i < length) && (out < safeout); ++i) {
uint64_t w = words[i];
while ((w != 0) && (out < safeout)) {
uint64_t t = w & (~w + 1);
int r = roaring_trailing_zeroes(w);
*out = (uint16_t)(r + base);
out++;
w ^= t;
w &= (w - 1);
}
base += 64;
}
Expand All @@ -864,10 +850,9 @@ size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length,
for (size_t i = 0; i < length; ++i) {
uint64_t w = words[i];
while (w != 0) {
uint64_t t = w & (~w + 1);
int r = roaring_trailing_zeroes(w);
out[outpos++] = (uint16_t)(r + base);
w ^= t;
w &= (w - 1);
}
base += 64;
}
Expand Down Expand Up @@ -1158,4 +1143,4 @@ void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length) {
#endif
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
#endif

0 comments on commit 1931a78

Please sign in to comment.