-
Notifications
You must be signed in to change notification settings - Fork 57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/refactor noodle masked load (WIP) #216
base: develop
Are you sure you want to change the base?
Changes from 1 commit
d4fde85
9f66822
476cefb
5f65b9f
0e2f6c1
5814d32
db3b0e9
f866b72
de66c74
9a53b19
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
/* | ||
* Copyright (c) 2017, Intel Corporation | ||
* Copyright (c) 2020-2021, VectorCamp PC | ||
* Copyright (c) 2020-2023, VectorCamp PC | ||
* | ||
* Redistribution and use in source and binary forms, with or without | ||
* modification, are permitted provided that the following conditions are met: | ||
|
@@ -34,7 +34,7 @@ | |
|
||
static really_really_inline | ||
hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, | ||
Z_TYPE z, size_t len, const struct cb_info *cbi) { | ||
Z_TYPE z, size_t len, const struct cb_info *cbi) { | ||
while (unlikely(z)) { | ||
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; | ||
size_t matchPos = d - buf + pos; | ||
|
@@ -47,9 +47,10 @@ hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, | |
|
||
static really_really_inline | ||
hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, | ||
Z_TYPE z, size_t len, const struct cb_info *cbi) { | ||
Z_TYPE z, size_t len, const struct cb_info *cbi) { | ||
while (unlikely(z)) { | ||
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT; | ||
DEBUG_PRINTF("pos %u\n", pos); | ||
size_t matchPos = d - buf + pos - 1; | ||
DEBUG_PRINTF("match pos %zu\n", matchPos); | ||
hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos); | ||
|
@@ -58,167 +59,67 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf, | |
return HWLM_SUCCESS; | ||
} | ||
|
||
|
||
// Scans the short range [start, end) of buf for the single-character pattern | ||
// held in mask1 and reports every hit through single_zscan(). | ||
// | ||
// n, cbi, len - forwarded unchanged to single_zscan(). | ||
// buf - base of the scanned buffer; positions are relative to it. | ||
// caseMask - ANDed with the loaded data before the comparison. | ||
// mask1 - vector the (case-masked) data is compared against. | ||
// start, end - half-open byte range to scan. | ||
// | ||
// Returns HWLM_SUCCESS for an empty range, otherwise whatever single_zscan() | ||
// returns. | ||
template<uint16_t S> | ||
static really_inline | ||
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, | ||
SuperVector<S> caseMask, SuperVector<S> mask1, | ||
const struct cb_info *cbi, size_t len, size_t start, | ||
size_t end) { | ||
const u8 *d = buf + start; | ||
DEBUG_PRINTF("start %zu end %zu\n", start, end); | ||
const size_t l = end - start; | ||
// l is a size_t, so it must be printed with %zu rather than %ld. | ||
DEBUG_PRINTF("l = %zu\n", l); | ||
// The memcpy below writes l bytes into the vector, so l must not exceed | ||
// the vector size (same guard as scanDoubleShort). | ||
// NOTE(review): assumes S is the vector width in bytes - confirm. | ||
assert(l <= S); | ||
if (!l) { | ||
return HWLM_SUCCESS; | ||
} | ||
|
||
// Copy only the l valid bytes into a zeroed vector so that nothing outside | ||
// [start, end) is read from the buffer. | ||
SuperVector<S> v = SuperVector<S>::Zeroes(); | ||
memcpy(&v.u, d, l); | ||
|
||
// NOTE(review): SINGLE_LOAD_MASK presumably selects the compare-mask bits | ||
// of the first l lanes - confirm against its definition. | ||
typename SuperVector<S>::comparemask_type mask = | ||
SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()); | ||
v = v & caseMask; | ||
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
return single_zscan(n, d, buf, z, len, cbi); | ||
} | ||
|
||
// The short scan routine. It is used both to scan data up to an | ||
// alignment boundary if needed and to finish off data that the aligned scan | ||
// function can't handle (due to small/unaligned chunk at end) | ||
// | ||
// A full vector is loaded (unaligned) from buf + offset, but only the | ||
// positions in [start, end) are kept in the compare mask: the load mask is | ||
// shifted by (start - offset) lanes before being applied. | ||
// | ||
// n, cbi, len - forwarded unchanged to single_zscan(). | ||
// caseMask - ANDed with the loaded data before the comparison. | ||
// mask1 - vector the (case-masked) data is compared against. | ||
// offset - buffer position at which the vector load begins. | ||
// start, end - half-open range of positions for which hits are reported. | ||
// | ||
// Returns HWLM_SUCCESS for an empty range, otherwise whatever single_zscan() | ||
// returns. | ||
template<uint16_t S> | ||
static really_inline | ||
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, | ||
SuperVector<S> caseMask, SuperVector<S> mask1, | ||
const struct cb_info *cbi, size_t len, size_t offset, | ||
size_t start, | ||
size_t end) { | ||
const u8 *d = buf + offset; | ||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); | ||
const size_t l = end - start; | ||
// l is a size_t, so it must be printed with %zu rather than %ld. | ||
DEBUG_PRINTF("l = %zu\n", l); | ||
assert(l <= 64); | ||
if (!l) { | ||
return HWLM_SUCCESS; | ||
} | ||
// Number of lanes between the start of the load and the first position of | ||
// interest. | ||
size_t buf_off = start - offset; | ||
// NOTE(review): SINGLE_LOAD_MASK presumably selects the compare-mask bits | ||
// of l lanes - confirm against its definition. | ||
typename SuperVector<S>::comparemask_type mask = | ||
SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()) | ||
<< (buf_off * SuperVector<S>::mask_width()); | ||
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; | ||
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v); | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
return single_zscan(n, d, buf, z, len, cbi); | ||
} | ||
|
||
// Scans the short range [start, end) of buf for the two-character pattern | ||
// (mask1 followed by mask2) and reports every hit through double_zscan(). | ||
// | ||
// n, cbi, len - forwarded unchanged to double_zscan(). | ||
// buf - base of the scanned buffer; positions are relative to it. | ||
// caseMask - ANDed with the loaded data before the comparisons. | ||
// mask1, mask2 - vectors the (case-masked) data is compared against. | ||
// start, end - half-open byte range to scan; at most S bytes (asserted). | ||
// | ||
// Returns HWLM_SUCCESS for an empty range, otherwise whatever double_zscan() | ||
// returns. | ||
template<uint16_t S> | ||
static really_inline | ||
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, | ||
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2, | ||
const struct cb_info *cbi, size_t len, size_t start, size_t end) { | ||
const u8 *d = buf + start; | ||
DEBUG_PRINTF("start %zu end %zu\n", start, end); | ||
const size_t l = end - start; | ||
assert(l <= S); | ||
if (!l) { | ||
return HWLM_SUCCESS; | ||
} | ||
// Copy only the l valid bytes into a zeroed vector so that nothing outside | ||
// [start, end) is read from the buffer. | ||
SuperVector<S> v = SuperVector<S>::Zeroes(); | ||
memcpy(&v.u, d, l); | ||
v = v & caseMask; | ||
|
||
// NOTE(review): DOUBLE_LOAD_MASK presumably selects the compare-mask bits | ||
// relevant for l lanes - confirm against its definition. | ||
typename SuperVector<S>::comparemask_type mask = | ||
DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()); | ||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); | ||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); | ||
// Shift the mask1 result up by mask_width() bits so that it lines up with | ||
// the mask2 result of the following position, then keep positions where | ||
// both agree. | ||
typename SuperVector<S>::comparemask_type z = | ||
mask & (z1 << (SuperVector<S>::mask_width())) & z2; | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
return double_zscan(n, d, buf, z, len, cbi); | ||
} | ||
|
||
// Two-character counterpart of the unaligned short scan: loads a full vector | ||
// (unaligned) from buf + offset, but only keeps compare-mask bits for the | ||
// positions in [start, end) by shifting the load mask by (start - offset) | ||
// lanes. Hits are reported through double_zscan(). | ||
// | ||
// n, cbi, len - forwarded unchanged to double_zscan(). | ||
// caseMask - ANDed with the loaded data before the comparisons. | ||
// mask1, mask2 - vectors the (case-masked) data is compared against. | ||
// offset - buffer position at which the vector load begins. | ||
// start, end - half-open range of positions of interest; at most S long | ||
// (asserted). | ||
// | ||
// Returns HWLM_SUCCESS for an empty range, otherwise whatever double_zscan() | ||
// returns. | ||
template<uint16_t S> | ||
static really_inline | ||
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, | ||
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2, | ||
const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) { | ||
const u8 *d = buf + offset; | ||
DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); | ||
const size_t l = end - start; | ||
assert(l <= S); | ||
if (!l) { | ||
return HWLM_SUCCESS; | ||
} | ||
SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask; | ||
// Number of lanes between the start of the load and the first position of | ||
// interest. | ||
size_t buf_off = start - offset; | ||
// NOTE(review): DOUBLE_LOAD_MASK presumably selects the compare-mask bits | ||
// relevant for l lanes - confirm against its definition. | ||
typename SuperVector<S>::comparemask_type mask = | ||
DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()) | ||
<< (buf_off * SuperVector<S>::mask_width()); | ||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); | ||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); | ||
// Shift the mask1 result up by mask_width() bits so that it lines up with | ||
// the mask2 result of the following position, then keep positions where | ||
// both agree. | ||
typename SuperVector<S>::comparemask_type z = | ||
mask & (z1 << SuperVector<S>::mask_width()) & z2; | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
return double_zscan(n, d, buf, z, len, cbi); | ||
} | ||
|
||
template <uint16_t S> | ||
static really_inline | ||
hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, | ||
size_t len, size_t offset, | ||
SuperVector<S> caseMask, SuperVector<S> mask1, | ||
const struct cb_info *cbi) { | ||
size_t start = offset + n->msk_len - 1; | ||
size_t end = len; | ||
|
||
const u8 *d = buf + start; | ||
const u8 *e = buf + end; | ||
DEBUG_PRINTF("start %p end %p \n", d, e); | ||
assert(d < e); | ||
if (e - d < S) { | ||
return scanSingleShort(n, buf, caseMask, mask1, cbi, len, start, end); | ||
} | ||
if (d + S <= e) { | ||
// peel off first part to cacheline boundary | ||
const u8 *d1 = ROUNDUP_PTR(d, S); | ||
DEBUG_PRINTF("until aligned %p \n", d1); | ||
if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) { | ||
return HWLM_TERMINATED; | ||
} | ||
d = d1; | ||
const u8 *buf_end = buf + len; | ||
assert(d < buf_end); | ||
|
||
DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf); | ||
DEBUG_PRINTF("b %s\n", buf); | ||
DEBUG_PRINTF("start %p end %p \n", d, buf_end); | ||
|
||
__builtin_prefetch(d + 16*64); | ||
assert(d < buf_end); | ||
if (d + S <= buf_end) { | ||
// Reach vector aligned boundaries | ||
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); | ||
if (!ISALIGNED_N(d, S)) { | ||
const u8 *d1 = ROUNDUP_PTR(d, S); | ||
DEBUG_PRINTF("d1 - d: %ld \n", d1 - d); | ||
size_t l = d1 - d; | ||
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask; | ||
typename SuperVector<S>::comparemask_type mask = SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()); | ||
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(chars); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you forgot the iteration_mask(z); here? I'm not sure what its purpose is, but it was there in the previous code, and it is also there in the double scan path. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Again, this is WIP; there is uncommitted code that I need to fix. iteration_mask is a way to reproduce the movemask functionality on Intel; it works differently on each architecture. |
||
|
||
size_t loops = (end - (d - buf)) / S; | ||
DEBUG_PRINTF("loops %ld \n", loops); | ||
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); | ||
RETURN_IF_TERMINATED(rv); | ||
d = d1; | ||
} | ||
|
||
for (size_t i = 0; i < loops; i++, d+= S) { | ||
while(d + S <= buf_end) { | ||
__builtin_prefetch(d + 16*64); | ||
DEBUG_PRINTF("d %p \n", d); | ||
const u8 *base = ROUNDUP_PTR(d, 64); | ||
// On large packet buffers, this prefetch appears to get us about 2%. | ||
__builtin_prefetch(base + 256); | ||
|
||
SuperVector<S> v = SuperVector<S>::load(d) & caseMask; | ||
typename SuperVector<S>::comparemask_type z = mask1.eqmask(v); | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); | ||
RETURN_IF_TERMINATED(rv); | ||
d += S; | ||
} | ||
} | ||
|
||
DEBUG_PRINTF("d %p e %p \n", d, e); | ||
DEBUG_PRINTF("d %p e %p \n", d, buf_end); | ||
// finish off tail | ||
size_t s2End = ROUNDDOWN_PTR(e, S) - buf; | ||
if (s2End == end) { | ||
return HWLM_SUCCESS; | ||
|
||
if (d != buf_end) { | ||
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask; | ||
size_t l = buf_end - d; | ||
typename SuperVector<S>::comparemask_type mask = SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width()); | ||
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(chars); | ||
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is the iteration_mask(z); missing here too? |
||
RETURN_IF_TERMINATED(rv); | ||
} | ||
|
||
return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, end - S, s2End, len); | ||
return HWLM_SUCCESS; | ||
} | ||
|
||
template <uint16_t S> | ||
|
@@ -227,66 +128,84 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, | |
size_t len, size_t offset, | ||
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2, | ||
const struct cb_info *cbi) { | ||
// we stop scanning for the key-fragment when the rest of the key can't | ||
// possibly fit in the remaining buffer | ||
size_t end = len - n->key_offset + 2; | ||
|
||
size_t start = offset + n->msk_len - n->key_offset; | ||
|
||
const u8 *d = buf + start; | ||
const u8 *buf_end = buf + end; | ||
assert(d < buf_end); | ||
|
||
DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf); | ||
DEBUG_PRINTF("b %s\n", buf); | ||
DEBUG_PRINTF("start %p end %p \n", d, buf_end); | ||
|
||
typename SuperVector<S>::comparemask_type lastz1{0}; | ||
|
||
const u8 *d = buf + start; | ||
const u8 *e = buf + end; | ||
DEBUG_PRINTF("start %p end %p \n", d, e); | ||
assert(d < e); | ||
if (e - d < S) { | ||
return scanDoubleShort(n, buf, caseMask, mask1, mask2, cbi, len, d - buf, end); | ||
} | ||
if (d + S <= e) { | ||
// peel off first part to cacheline boundary | ||
const u8 *d1 = ROUNDUP_PTR(d, S) + 1; | ||
DEBUG_PRINTF("until aligned %p \n", d1); | ||
if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) { | ||
return HWLM_TERMINATED; | ||
} | ||
d = d1 - 1; | ||
__builtin_prefetch(d + 16*64); | ||
assert(d < buf_end); | ||
if (d + S <= buf_end) { | ||
// Reach vector aligned boundaries | ||
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); | ||
if (!ISALIGNED_N(d, S)) { | ||
const u8 *d1 = ROUNDUP_PTR(d, S); | ||
size_t l = d1 - d; | ||
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask; | ||
typename SuperVector<S>::comparemask_type mask = DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()); | ||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars); | ||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars); | ||
typename SuperVector<S>::comparemask_type z = mask & (z1 << SuperVector<S>::mask_width()) & z2; | ||
lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width()); | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
size_t loops = (end - (d - buf)) / S; | ||
DEBUG_PRINTF("loops %ld \n", loops); | ||
hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); | ||
RETURN_IF_TERMINATED(rv); | ||
d = d1; | ||
} | ||
|
||
for (size_t i = 0; i < loops; i++, d+= S) { | ||
while(d + S <= buf_end) { | ||
__builtin_prefetch(d + 16*64); | ||
DEBUG_PRINTF("d %p \n", d); | ||
const u8 *base = ROUNDUP_PTR(d, 64); | ||
// On large packet buffers, this prefetch appears to get us about 2%. | ||
__builtin_prefetch(base + 256); | ||
|
||
SuperVector<S> v = SuperVector<S>::load(d) & caseMask; | ||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v); | ||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v); | ||
typename SuperVector<S>::comparemask_type z = | ||
(z1 << SuperVector<S>::mask_width() | lastz1) & z2; | ||
SuperVector<S> chars = SuperVector<S>::load(d) & caseMask; | ||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars); | ||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars); | ||
typename SuperVector<S>::comparemask_type z = (z1 << SuperVector<S>::mask_width() | lastz1) & z2; | ||
lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width()); | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); | ||
RETURN_IF_TERMINATED(rv); | ||
} | ||
if (loops == 0) { | ||
d = d1; | ||
d += S; | ||
} | ||
} | ||
|
||
DEBUG_PRINTF("d %p e %p \n", d, buf_end); | ||
// finish off tail | ||
size_t s2End = ROUNDDOWN_PTR(e, S) - buf; | ||
if (s2End == end) { | ||
return HWLM_SUCCESS; | ||
|
||
if (d != buf_end) { | ||
size_t l = buf_end - d; | ||
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask; | ||
typename SuperVector<S>::comparemask_type mask = DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width()); | ||
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars); | ||
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars); | ||
typename SuperVector<S>::comparemask_type z = mask & (z1 << SuperVector<S>::mask_width() | lastz1) & z2; | ||
z = SuperVector<S>::iteration_mask(z); | ||
|
||
hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi); | ||
RETURN_IF_TERMINATED(rv); | ||
} | ||
return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, end - S, d - buf, end); | ||
|
||
return HWLM_SUCCESS; | ||
} | ||
|
||
// Single-character specialisation, used when keyLen = 1 | ||
static really_inline | ||
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, | ||
size_t start, bool noCase, const struct cb_info *cbi) { | ||
/* if (len < VECTORSIZE) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Commented-out code. Shouldn't it be removed? |
||
return scanSingleSlow(n, buf, len, start, noCase, n->key0, cbi); | ||
}*/ | ||
|
||
if (!ourisalpha(n->key0)) { | ||
noCase = 0; // force noCase off if we don't have an alphabetic char | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I understand it, this clears a single bit. We handle the case where the mask is wider with Z_POSSHIFT. But I believe that in the case of NEON, we'd have all the bits set to 1, so we'd iterate 4 times in this loop? Or maybe I missed something else?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's still WIP; I have some local fixes for this, which is why it has not been merged yet.