Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/refactor noodle masked load (WIP) #216

Draft
wants to merge 10 commits into
base: develop
Choose a base branch
from
269 changes: 94 additions & 175 deletions src/hwlm/noodle_engine_simd.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2020-2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
Expand Down Expand Up @@ -34,7 +34,7 @@

static really_really_inline
hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
Z_TYPE z, size_t len, const struct cb_info *cbi) {
Z_TYPE z, size_t len, const struct cb_info *cbi) {
while (unlikely(z)) {
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I understand it, this clears a single bit. We handle the case where the mask is wider with Z_POSSHIFT. But I believe that in the case of NEON, we'd have all the bits set to 1, so we'd iterate 4 times in this loop? Or maybe I missed something else?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's still WIP; I have some local fixes for this, which is why it has not been merged yet.

size_t matchPos = d - buf + pos;
Expand All @@ -47,9 +47,10 @@ hwlm_error_t single_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,

static really_really_inline
hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
Z_TYPE z, size_t len, const struct cb_info *cbi) {
Z_TYPE z, size_t len, const struct cb_info *cbi) {
while (unlikely(z)) {
Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z) >> Z_POSSHIFT;
DEBUG_PRINTF("pos %u\n", pos);
size_t matchPos = d - buf + pos - 1;
DEBUG_PRINTF("match pos %zu\n", matchPos);
hwlmcb_rv_t rv = final(n, buf, len, true, cbi, matchPos);
Expand All @@ -58,167 +59,67 @@ hwlm_error_t double_zscan(const struct noodTable *n,const u8 *d, const u8 *buf,
return HWLM_SUCCESS;
}


/**
 * Scan a short region of the buffer for the single-character key.
 *
 * The bytes in [start, end) are copied into a zero-initialised vector, ANDed
 * with caseMask and compared against mask1. The resulting compare mask is
 * combined with SINGLE_LOAD_MASK (presumably limiting it to the first
 * end - start lanes) and handed to single_zscan() to report candidates.
 *
 * @param n        noodle table, forwarded to single_zscan()
 * @param buf      start of the scanned buffer
 * @param caseMask mask ANDed onto the loaded bytes before comparison
 * @param mask1    vector the masked bytes are compared against
 * @param cbi      callback info, forwarded to single_zscan()
 * @param len      total buffer length, forwarded to single_zscan()
 * @param start    offset in buf of the first byte to scan
 * @param end      offset in buf one past the last byte to scan
 * @return HWLM_SUCCESS when the range is empty, otherwise the result of
 *         single_zscan()
 */
template<uint16_t S>
static really_inline
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
                             SuperVector<S> caseMask, SuperVector<S> mask1,
                             const struct cb_info *cbi, size_t len, size_t start,
                             size_t end) {
    const u8 *d = buf + start;
    DEBUG_PRINTF("start %zu end %zu\n", start, end);
    const size_t l = end - start;
    // l is a size_t, so it has to be printed with %zu; %ld is only correct
    // where long and size_t happen to have the same width.
    DEBUG_PRINTF("l = %zu\n", l);
    // The memcpy below copies l bytes into a single vector, so l must not
    // exceed the vector width (same check as scanDoubleShort).
    assert(l <= S);
    if (!l) {
        return HWLM_SUCCESS;
    }

    // Copy into a zeroed vector instead of loading directly, so that no byte
    // past `end` is read from the buffer.
    SuperVector<S> v = SuperVector<S>::Zeroes();
    memcpy(&v.u, d, l);

    typename SuperVector<S>::comparemask_type mask =
        SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
    v = v & caseMask;
    typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
    z = SuperVector<S>::iteration_mask(z);

    return single_zscan(n, d, buf, z, len, cbi);
}

/**
 * The short scan routine for the single-character key. It is used both to
 * scan data up to an alignment boundary if needed and to finish off data that
 * the aligned scan function can't handle (due to a small/unaligned chunk at
 * the end).
 *
 * One vector is loaded (unaligned) from buf + offset, ANDed with caseMask and
 * compared against mask1. The load mask built for end - start lanes is
 * shifted by start - offset lanes before being applied, and the remaining
 * candidates are handed to single_zscan().
 *
 * @param n        noodle table, forwarded to single_zscan()
 * @param buf      start of the scanned buffer
 * @param caseMask mask ANDed onto the loaded bytes before comparison
 * @param mask1    vector the masked bytes are compared against
 * @param cbi      callback info, forwarded to single_zscan()
 * @param len      total buffer length, forwarded to single_zscan()
 * @param offset   offset in buf at which the vector is loaded
 * @param start    offset in buf of the first byte of interest
 *                 (presumably callers guarantee start >= offset -- confirm)
 * @param end      offset in buf one past the last byte of interest
 * @return HWLM_SUCCESS when the range is empty, otherwise the result of
 *         single_zscan()
 */
template<uint16_t S>
static really_inline
hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf,
                                 SuperVector<S> caseMask, SuperVector<S> mask1,
                                 const struct cb_info *cbi, size_t len, size_t offset,
                                 size_t start,
                                 size_t end) {
    const u8 *d = buf + offset;
    DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);
    const size_t l = end - start;
    // l is a size_t, so it has to be printed with %zu; %ld is only correct
    // where long and size_t happen to have the same width.
    DEBUG_PRINTF("l = %zu\n", l);
    // Only one S-byte vector is loaded below, so the range of interest cannot
    // be wider than S (same check as scanDoubleUnaligned).
    assert(l <= S);
    if (!l) {
        return HWLM_SUCCESS;
    }
    // Number of lanes between the load address and the first byte of interest.
    size_t buf_off = start - offset;
    typename SuperVector<S>::comparemask_type mask =
        SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width())
        << (buf_off * SuperVector<S>::mask_width());
    SuperVector<S> v = SuperVector<S>::loadu(d) & caseMask;
    typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(v);
    z = SuperVector<S>::iteration_mask(z);

    return single_zscan(n, d, buf, z, len, cbi);
}

/**
 * Scan a short region of the buffer for the two-character key fragment.
 *
 * The bytes in [start, end) are copied into a zero-initialised vector and
 * ANDed with caseMask. The compare mask for mask1 is shifted up by one lane
 * and ANDed with the compare mask for mask2 and with DOUBLE_LOAD_MASK; the
 * surviving candidates are handed to double_zscan().
 *
 * @return HWLM_SUCCESS when the range is empty, otherwise the result of
 *         double_zscan()
 */
template<uint16_t S>
static really_inline
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
                             SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                             const struct cb_info *cbi, size_t len, size_t start, size_t end) {
    using cmpmask_t = typename SuperVector<S>::comparemask_type;

    const u8 *base = buf + start;
    DEBUG_PRINTF("start %zu end %zu\n", start, end);

    const size_t span = end - start;
    assert(span <= S);
    if (span == 0) {
        return HWLM_SUCCESS;
    }

    // Stage the bytes in a zeroed vector so nothing past `end` is read.
    SuperVector<S> chunk = SuperVector<S>::Zeroes();
    memcpy(&chunk.u, base, span);
    chunk = chunk & caseMask;

    cmpmask_t loadMask = DOUBLE_LOAD_MASK(span * SuperVector<S>::mask_width());
    cmpmask_t firstHits = mask1.eqmask(chunk);
    cmpmask_t secondHits = mask2.eqmask(chunk);

    // A first-character hit in lane i pairs with a second-character hit in
    // lane i + 1, hence the one-lane shift.
    cmpmask_t hits = loadMask & (firstHits << (SuperVector<S>::mask_width())) & secondHits;
    hits = SuperVector<S>::iteration_mask(hits);

    return double_zscan(n, base, buf, hits, len, cbi);
}

/**
 * Scan one unaligned vector for the two-character key fragment.
 *
 * A vector is loaded from buf + offset and ANDed with caseMask. The load mask
 * built for end - start lanes is shifted by start - offset lanes. The compare
 * mask for mask1 is shifted up by one lane and ANDed with the compare mask
 * for mask2 and the load mask; the surviving candidates are handed to
 * double_zscan().
 *
 * @return HWLM_SUCCESS when the range is empty, otherwise the result of
 *         double_zscan()
 */
template<uint16_t S>
static really_inline
hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf,
                                 SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
                                 const struct cb_info *cbi, size_t len, size_t offset, size_t start, size_t end) {
    using cmpmask_t = typename SuperVector<S>::comparemask_type;

    const u8 *base = buf + offset;
    DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset);

    const size_t span = end - start;
    assert(span <= S);
    if (span == 0) {
        return HWLM_SUCCESS;
    }

    SuperVector<S> chunk = SuperVector<S>::loadu(base) & caseMask;

    // Lanes between the load address and the first byte of interest.
    size_t skip = start - offset;
    cmpmask_t loadMask =
        DOUBLE_LOAD_MASK(span * SuperVector<S>::mask_width())
        << (skip * SuperVector<S>::mask_width());

    cmpmask_t firstHits = mask1.eqmask(chunk);
    cmpmask_t secondHits = mask2.eqmask(chunk);

    // A first-character hit in lane i pairs with a second-character hit in
    // lane i + 1, hence the one-lane shift.
    cmpmask_t hits = loadMask & (firstHits << SuperVector<S>::mask_width()) & secondHits;
    hits = SuperVector<S>::iteration_mask(hits);

    return double_zscan(n, base, buf, hits, len, cbi);
}

template <uint16_t S>
static really_inline
hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset,
SuperVector<S> caseMask, SuperVector<S> mask1,
const struct cb_info *cbi) {
size_t start = offset + n->msk_len - 1;
size_t end = len;

const u8 *d = buf + start;
const u8 *e = buf + end;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
if (e - d < S) {
return scanSingleShort(n, buf, caseMask, mask1, cbi, len, start, end);
}
if (d + S <= e) {
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, S);
DEBUG_PRINTF("until aligned %p \n", d1);
if (scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1;
const u8 *buf_end = buf + len;
assert(d < buf_end);

DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
DEBUG_PRINTF("start %p end %p \n", d, buf_end);

__builtin_prefetch(d + 16*64);
assert(d < buf_end);
if (d + S <= buf_end) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
const u8 *d1 = ROUNDUP_PTR(d, S);
DEBUG_PRINTF("d1 - d: %ld \n", d1 - d);
size_t l = d1 - d;
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
typename SuperVector<S>::comparemask_type mask = SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(chars);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you forgot the iteration_mask(z); here? I'm not sure what its purpose is, but it was there in the previous code, and it is also there in the double scan path.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, this is WIP; there is uncommitted code that I need to fix. iteration_mask is a way to reproduce the functionality of Intel's movemask; it works differently on each architecture.


size_t loops = (end - (d - buf)) / S;
DEBUG_PRINTF("loops %ld \n", loops);
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
d = d1;
}

for (size_t i = 0; i < loops; i++, d+= S) {
while(d + S <= buf_end) {
__builtin_prefetch(d + 16*64);
DEBUG_PRINTF("d %p \n", d);
const u8 *base = ROUNDUP_PTR(d, 64);
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(base + 256);

SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
typename SuperVector<S>::comparemask_type z = mask1.eqmask(v);
z = SuperVector<S>::iteration_mask(z);

hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
d += S;
}
}

DEBUG_PRINTF("d %p e %p \n", d, e);
DEBUG_PRINTF("d %p e %p \n", d, buf_end);
// finish off tail
size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
if (s2End == end) {
return HWLM_SUCCESS;

if (d != buf_end) {
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
size_t l = buf_end - d;
typename SuperVector<S>::comparemask_type mask = SINGLE_LOAD_MASK(l * SuperVector<S>::mask_width());
typename SuperVector<S>::comparemask_type z = mask & mask1.eqmask(chars);
hwlm_error_t rv = single_zscan(n, d, buf, z, len, cbi);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing the iteration_mask(z); here too?

RETURN_IF_TERMINATED(rv);
}

return scanSingleUnaligned(n, buf, caseMask, mask1, cbi, len, end - S, s2End, len);
return HWLM_SUCCESS;
}

template <uint16_t S>
Expand All @@ -227,66 +128,84 @@ hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf,
size_t len, size_t offset,
SuperVector<S> caseMask, SuperVector<S> mask1, SuperVector<S> mask2,
const struct cb_info *cbi) {
// we stop scanning for the key-fragment when the rest of the key can't
// possibly fit in the remaining buffer
size_t end = len - n->key_offset + 2;

size_t start = offset + n->msk_len - n->key_offset;

const u8 *d = buf + start;
const u8 *buf_end = buf + end;
assert(d < buf_end);

DEBUG_PRINTF("noodle %p start %zu len %zu\n", buf, start, buf_end - buf);
DEBUG_PRINTF("b %s\n", buf);
DEBUG_PRINTF("start %p end %p \n", d, buf_end);

typename SuperVector<S>::comparemask_type lastz1{0};

const u8 *d = buf + start;
const u8 *e = buf + end;
DEBUG_PRINTF("start %p end %p \n", d, e);
assert(d < e);
if (e - d < S) {
return scanDoubleShort(n, buf, caseMask, mask1, mask2, cbi, len, d - buf, end);
}
if (d + S <= e) {
// peel off first part to cacheline boundary
const u8 *d1 = ROUNDUP_PTR(d, S) + 1;
DEBUG_PRINTF("until aligned %p \n", d1);
if (scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, start, start, d1 - buf) == HWLM_TERMINATED) {
return HWLM_TERMINATED;
}
d = d1 - 1;
__builtin_prefetch(d + 16*64);
assert(d < buf_end);
if (d + S <= buf_end) {
// Reach vector aligned boundaries
DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
if (!ISALIGNED_N(d, S)) {
const u8 *d1 = ROUNDUP_PTR(d, S);
size_t l = d1 - d;
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
typename SuperVector<S>::comparemask_type mask = DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars);
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars);
typename SuperVector<S>::comparemask_type z = mask & (z1 << SuperVector<S>::mask_width()) & z2;
lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
z = SuperVector<S>::iteration_mask(z);

size_t loops = (end - (d - buf)) / S;
DEBUG_PRINTF("loops %ld \n", loops);
hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
d = d1;
}

for (size_t i = 0; i < loops; i++, d+= S) {
while(d + S <= buf_end) {
__builtin_prefetch(d + 16*64);
DEBUG_PRINTF("d %p \n", d);
const u8 *base = ROUNDUP_PTR(d, 64);
// On large packet buffers, this prefetch appears to get us about 2%.
__builtin_prefetch(base + 256);

SuperVector<S> v = SuperVector<S>::load(d) & caseMask;
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(v);
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(v);
typename SuperVector<S>::comparemask_type z =
(z1 << SuperVector<S>::mask_width() | lastz1) & z2;
SuperVector<S> chars = SuperVector<S>::load(d) & caseMask;
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars);
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars);
typename SuperVector<S>::comparemask_type z = (z1 << SuperVector<S>::mask_width() | lastz1) & z2;
lastz1 = z1 >> (Z_SHIFT * SuperVector<S>::mask_width());
z = SuperVector<S>::iteration_mask(z);

hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
}
if (loops == 0) {
d = d1;
d += S;
}
}

DEBUG_PRINTF("d %p e %p \n", d, buf_end);
// finish off tail
size_t s2End = ROUNDDOWN_PTR(e, S) - buf;
if (s2End == end) {
return HWLM_SUCCESS;

if (d != buf_end) {
size_t l = buf_end - d;
SuperVector<S> chars = SuperVector<S>::loadu(d) & caseMask;
typename SuperVector<S>::comparemask_type mask = DOUBLE_LOAD_MASK(l * SuperVector<S>::mask_width());
typename SuperVector<S>::comparemask_type z1 = mask1.eqmask(chars);
typename SuperVector<S>::comparemask_type z2 = mask2.eqmask(chars);
typename SuperVector<S>::comparemask_type z = mask & (z1 << SuperVector<S>::mask_width() | lastz1) & z2;
z = SuperVector<S>::iteration_mask(z);

hwlm_error_t rv = double_zscan(n, d, buf, z, len, cbi);
RETURN_IF_TERMINATED(rv);
}
return scanDoubleUnaligned(n, buf, caseMask, mask1, mask2, cbi, len, end - S, d - buf, end);

return HWLM_SUCCESS;
}

// Single-character specialisation, used when keyLen = 1
static really_inline
hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len,
size_t start, bool noCase, const struct cb_info *cbi) {
/* if (len < VECTORSIZE) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commented-out code. Shouldn't it be removed?

return scanSingleSlow(n, buf, len, start, noCase, n->key0, cbi);
}*/

if (!ourisalpha(n->key0)) {
noCase = 0; // force noCase off if we don't have an alphabetic char
}
Expand Down