Skip to content

Commit

Permalink
Optimize scalar fallback
Browse files Browse the repository at this point in the history
  • Loading branch information
kg committed Apr 15, 2024
1 parent 4169252 commit 338b23c
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 14 deletions.
50 changes: 37 additions & 13 deletions src/native/containers/dn-simdhash-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,30 @@
// HACK: for better language server parsing
#include "dn-simdhash.h"

// Scalar lane-by-lane search: compares needle[k] against haystack[k] for the
// first 'count' lanes and returns the index of the first lane where the two
// bytes are equal. Returns 32 when none of those lanes match.
static DN_FORCEINLINE(int)
find_first_matching_suffix_scalar (uint8_t needle[DN_SIMDHASH_VECTOR_WIDTH], uint8_t haystack[DN_SIMDHASH_VECTOR_WIDTH], uint32_t count)
{
	// TODO: It might be profitable to hand-unroll this loop, but right now doing so
	//  hits a bug in clang and generates really bad WASM.
	uint32_t lane = 0;

	while (lane < count) {
		if (haystack[lane] == needle[lane])
			return lane;
		lane++;
	}

	// No lane matched within the first 'count' entries.
	return 32;
}

// Scalar search for a single byte: scans the first 'count' lanes of haystack
// and returns the index of the first lane equal to 'needle'. Returns 32 when
// none of those lanes match.
static DN_FORCEINLINE(int)
find_first_matching_suffix_scalar_1 (uint8_t needle, uint8_t haystack[DN_SIMDHASH_VECTOR_WIDTH], uint32_t count)
{
	// TODO: It might be profitable to hand-unroll this loop, but right now doing so
	//  hits a bug in clang and generates really bad WASM.
	uint32_t lane;

	for (lane = 0; lane < count; lane++) {
		if (haystack[lane] == needle)
			return lane;
	}

	// Every examined lane differed from the needle.
	return 32;
}

#if defined(__clang__) || defined (__GNUC__) // use vector intrinsics

#if defined(__wasm_simd128__)
Expand All @@ -19,10 +43,14 @@
* #include <arm_neon.h>
*/
#elif defined(__wasm)
#ifdef DN_SIMDHASH_WARNINGS
#pragma message("WARNING: Building dn_simdhash for WASM without -msimd128! Performance will be terrible!")
#endif
#else
#ifdef DN_SIMDHASH_WARNINGS
#pragma message("WARNING: Unsupported architecture for dn_simdhash! Performance will be terrible!")
#endif
#endif

// extract/replace lane opcodes require constant indices on some target architectures,
// and in some cases it is profitable to do a single-byte memory load/store instead of
Expand Down Expand Up @@ -69,7 +97,7 @@ build_search_vector (uint8_t needle)

// returns an index in range 0-14 on match, 32 if no match
static DN_FORCEINLINE(uint32_t)
find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes haystack)
find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes haystack, uint32_t count)
{
#if defined(__wasm_simd128__)
return ctz(wasm_i8x16_bitmask(wasm_i8x16_eq(needle.vec, haystack.vec)));
Expand All @@ -95,11 +123,8 @@ find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes ha
* return ctz(msb.u);
*/
#else
for (uint32_t i = 0, c = dn_simdhash_bucket_count(haystack); i < c; i++)
if (needle.values[i] == haystack.values[i])
return i;

return 32;
#define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
return find_first_matching_suffix_scalar(needle.values, haystack.values, count);
#endif
}

Expand Down Expand Up @@ -141,7 +166,7 @@ build_search_vector (uint8_t needle)

// returns an index in range 0-14 on match, 32 if no match
static DN_FORCEINLINE(uint32_t)
find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes haystack)
find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes haystack, uint32_t count)
{
// FIXME: Completely untested.
__m128i match_vector = _mm_cmpeq_epi8(needle.m128, haystack.m128);
Expand All @@ -150,7 +175,9 @@ find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes ha

#else // unknown compiler and/or unknown non-simd arch

#ifdef DN_SIMDHASH_WARNINGS
#pragma message("WARNING: Unsupported architecture/compiler for dn_simdhash! Performance will be terrible!")
#endif

typedef struct {
uint8_t values[DN_SIMDHASH_VECTOR_WIDTH];
Expand All @@ -167,13 +194,10 @@ build_search_vector (uint8_t needle)

// returns an index in range 0-14 on match, 32 if no match
static DN_FORCEINLINE(uint32_t)
find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes haystack)
find_first_matching_suffix (dn_simdhash_suffixes needle, dn_simdhash_suffixes haystack, uint32_t count)
{
for (uint32_t i = 0, c = dn_simdhash_bucket_count(haystack); i < c; i++)
if (needle.values[i] == haystack.values[i])
return i;

return 32;
#define DN_SIMDHASH_USE_SCALAR_FALLBACK 1
return find_first_matching_suffix_scalar(needle.values, haystack.values, count);
}

#endif // end of clang/gcc or msvc or fallback
Expand Down
9 changes: 8 additions & 1 deletion src/native/containers/dn-simdhash-specialization.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,14 @@ static DN_FORCEINLINE(int)
DN_SIMDHASH_SCAN_BUCKET_INTERNAL (bucket_t *bucket, DN_SIMDHASH_KEY_T needle, dn_simdhash_suffixes search_vector)
{
uint32_t count = dn_simdhash_bucket_count(bucket->suffixes),
index = find_first_matching_suffix(search_vector, bucket->suffixes);
#if DN_SIMDHASH_USE_SCALAR_FALLBACK
// HACK: This allows the creation of the search_vector in our caller to be optimized out,
// and allows the search to compare each lane against a single scalar instead of having to
// compare two search vectors lane-by-lane.
index = find_first_matching_suffix_scalar_1(search_vector.values[0], bucket->suffixes.values, count);
#else
index = find_first_matching_suffix(search_vector, bucket->suffixes, count);
#endif
DN_SIMDHASH_KEY_T *key = &bucket->keys[index];

for (; index < count; index++, key++) {
Expand Down

0 comments on commit 338b23c

Please sign in to comment.