Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove attempt on unsized find vectorization... #4486

Merged
merged 13 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 0 additions & 19 deletions benchmarks/src/find_and_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,31 +42,12 @@ BENCHMARK(bm<uint8_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<uint8_t, 8021, 3056, Op::Count>);

BENCHMARK(bm<uint16_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<uint16_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<uint16_t, 8021, 3056, Op::Count>);

BENCHMARK(bm<uint32_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<uint32_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<uint32_t, 8021, 3056, Op::Count>);

BENCHMARK(bm<uint64_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<uint64_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<uint64_t, 8021, 3056, Op::Count>);

BENCHMARK(bm<int8_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<int8_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<int8_t, 8021, 3056, Op::Count>);

BENCHMARK(bm<int16_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<int16_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<int16_t, 8021, 3056, Op::Count>);

BENCHMARK(bm<int32_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<int32_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<int32_t, 8021, 3056, Op::Count>);

BENCHMARK(bm<int64_t, 8021, 3056, Op::FindSized>);
BENCHMARK(bm<int64_t, 8021, 3056, Op::FindUnsized>);
BENCHMARK(bm<int64_t, 8021, 3056, Op::Count>);

BENCHMARK_MAIN();
85 changes: 28 additions & 57 deletions stl/inc/xutility
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,6 @@ const void* __stdcall __std_find_trivial_2(const void* _First, const void* _Last
const void* __stdcall __std_find_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
const void* __stdcall __std_find_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;

const void* __stdcall __std_find_trivial_unsized_1(const void* _First, uint8_t _Val) noexcept;
const void* __stdcall __std_find_trivial_unsized_2(const void* _First, uint16_t _Val) noexcept;
const void* __stdcall __std_find_trivial_unsized_4(const void* _First, uint32_t _Val) noexcept;
const void* __stdcall __std_find_trivial_unsized_8(const void* _First, uint64_t _Val) noexcept;

const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, bool _Signed) noexcept;
const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept;
const void* __stdcall __std_min_element_4(const void* _First, const void* _Last, bool _Signed) noexcept;
Expand Down Expand Up @@ -170,27 +165,6 @@ _Ty* __std_find_trivial(_Ty* const _First, _Ty* const _Last, const _TVal _Val) n
}
}

template <class _Ty, class _TVal>
_Ty* __std_find_trivial_unsized(_Ty* const _First, const _TVal _Val) noexcept {
if constexpr (is_pointer_v<_TVal> || is_null_pointer_v<_TVal>) {
return _STD __std_find_trivial_unsized(_First, reinterpret_cast<uintptr_t>(_Val));
} else if constexpr (sizeof(_Ty) == 1) {
return const_cast<_Ty*>(
static_cast<const _Ty*>(::__std_find_trivial_unsized_1(_First, static_cast<uint8_t>(_Val))));
} else if constexpr (sizeof(_Ty) == 2) {
return const_cast<_Ty*>(
static_cast<const _Ty*>(::__std_find_trivial_unsized_2(_First, static_cast<uint16_t>(_Val))));
} else if constexpr (sizeof(_Ty) == 4) {
return const_cast<_Ty*>(
static_cast<const _Ty*>(::__std_find_trivial_unsized_4(_First, static_cast<uint32_t>(_Val))));
} else if constexpr (sizeof(_Ty) == 8) {
return const_cast<_Ty*>(
static_cast<const _Ty*>(::__std_find_trivial_unsized_8(_First, static_cast<uint64_t>(_Val))));
} else {
static_assert(_Always_false<_Ty>, "Unexpected size");
}
}

template <class _Ty>
_Ty* __std_min_element(_Ty* const _First, _Ty* const _Last) noexcept {
constexpr bool _Signed = is_signed_v<_Ty>;
Expand Down Expand Up @@ -5975,8 +5949,14 @@ namespace ranges {
template <input_iterator _It, sentinel_for<_It> _Se, class _Ty, class _Pj = identity>
requires indirect_binary_predicate<ranges::equal_to, projected<_It, _Pj>, const _Ty*>
_NODISCARD constexpr _It _Find_unchecked(_It _First, const _Se _Last, const _Ty& _Val, _Pj _Proj = {}) {
constexpr bool _Is_sized = sized_sentinel_for<_Se, _It>;
if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty> && _Sized_or_unreachable_sentinel_for<_Se, _It>
// TRANSITION, DevCom-1614562: not trying wmemchr
// Only one-byte elements are suitable for unsized optimization
constexpr bool _Single_byte_elements = sizeof(_Iter_value_t<_It>) == 1;
constexpr bool _Is_sized = sized_sentinel_for<_Se, _It>;

if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty>
&& (_Single_byte_elements ? _Is_sized || same_as<_Se, unreachable_sentinel_t>
: _Is_sized && _USE_STD_VECTOR_ALGORITHMS)
&& same_as<_Pj, identity>) {
if (!_STD is_constant_evaluated()) {
if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) {
Expand All @@ -5988,49 +5968,40 @@ namespace ranges {
}
}

using _Ptr_t = remove_reference_t<_Iter_ref_t<_It>>*;
#if _USE_STD_VECTOR_ALGORITHMS
using _Ptr_t = remove_reference_t<_Iter_ref_t<_It>>*;
const auto _First_ptr = _STD _To_address(_First);

_Ptr_t _Result;

if constexpr (_Is_sized) {
#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (!_Single_byte_elements) {
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
_STL_INTERNAL_STATIC_ASSERT(_Is_sized);
const auto _Last_ptr = _First_ptr + (_Last - _First);

_Result = _STD __std_find_trivial(_First_ptr, _Last_ptr, _Val);
} else {
_Result = _STD __std_find_trivial_unsized(_First_ptr, _Val);
}

if constexpr (is_pointer_v<_It>) {
return _Result;
} else {
return _RANGES next(_STD move(_First), _Result - _First_ptr);
}
#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv
if constexpr (sizeof(_Iter_value_t<_It>) == 1) {
_Result = _STD __std_find_trivial(_First_ptr, _Last_ptr, _Val);
} else
#endif // ^^^ _USE_STD_VECTOR_ALGORITHMS ^^^
{
_STL_INTERNAL_STATIC_ASSERT(_Single_byte_elements);
size_t _Count;
if constexpr (_Is_sized) {
_Count = static_cast<size_t>(_Last - _First);
} else {
_Count = SIZE_MAX;
}

const auto _First_ptr = _STD to_address(_First);
const auto _Result =
static_cast<_Ptr_t>(_CSTD memchr(_First_ptr, static_cast<unsigned char>(_Val), _Count));
if (_Result) {
if constexpr (is_pointer_v<_It>) {
return _Result;
} else {
return _RANGES next(_STD move(_First), _Result - _First_ptr);
_Result = static_cast<_Ptr_t>(_CSTD memchr(_First_ptr, static_cast<unsigned char>(_Val), _Count));

if constexpr (_Is_sized) {
if (_Result == nullptr) {
return _RANGES next(_STD move(_First), _Last);
}
} else {
return _RANGES next(_STD move(_First), _Last);
}
}
// TRANSITION, DevCom-1614562: not trying wmemchr
#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^

if constexpr (is_pointer_v<_It>) {
return _Result;
} else {
return _RANGES next(_STD move(_First), _Result - _First_ptr);
}
}
}

Expand Down
93 changes: 16 additions & 77 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1837,86 +1837,20 @@ namespace {
#endif // !_M_ARM64EC
};

// The below functions have exactly the same signature as the extern "C" functions, up to calling convention.
// This makes sure the template specialization is fused with the extern "C" function.
// In optimized builds it avoids an extra call, as this function is too large to inline.

template <class _Traits, class _Ty>
const void* __stdcall __std_find_trivial_unsized_impl(const void* _First, const _Ty _Val) noexcept {
#ifndef _M_ARM64EC
if ((reinterpret_cast<uintptr_t>(_First) & (sizeof(_Ty) - 1)) != 0) {
// _First isn't aligned to sizeof(_Ty), so we need to use the scalar fallback below.
// This can happen with 8-byte elements on x86's 4-aligned stack. It can also happen with packed structs.
} else if (_Use_avx2()) {
_Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414

// We read by vector-sized pieces, and we align pointers to vector-sized boundary.
// From start partial piece we mask out matches that don't belong to the range.
// This makes sure we never cross page boundary, thus we read 'as if' sequentially.
constexpr size_t _Vector_pad_mask = 0x1F;
constexpr unsigned int _Full_mask = 0xFFFF'FFFF;

const __m256i _Comparand = _Traits::_Set_avx(_Val);
const intptr_t _Pad_start = reinterpret_cast<intptr_t>(_First) & _Vector_pad_mask;
const unsigned int _Mask = _Full_mask << _Pad_start;
_Advance_bytes(_First, -_Pad_start);

__m256i _Data = _mm256_load_si256(static_cast<const __m256i*>(_First));
unsigned int _Bingo = static_cast<unsigned int>(_mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand)));

_Bingo &= _Mask;

for (;;) {
if (_Bingo != 0) {
unsigned long _Offset = _tzcnt_u32(_Bingo);
_Advance_bytes(_First, _Offset);
return _First;
}

_Advance_bytes(_First, 32);

_Data = _mm256_load_si256(static_cast<const __m256i*>(_First));
_Bingo = static_cast<unsigned int>(_mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand)));
}
} else if (_Traits::_Sse_available()) {
// We read by vector-sized pieces, and we align pointers to vector-sized boundary.
// From start partial piece we mask out matches that don't belong to the range.
// This makes sure we never cross page boundary, thus we read 'as if' sequentially.
constexpr size_t _Vector_pad_mask = 0xF;
constexpr unsigned int _Full_mask = 0xFFFF;

const __m128i _Comparand = _Traits::_Set_sse(_Val);
const intptr_t _Pad_start = reinterpret_cast<intptr_t>(_First) & _Vector_pad_mask;
const unsigned int _Mask = _Full_mask << _Pad_start;
_Advance_bytes(_First, -_Pad_start);

__m128i _Data = _mm_load_si128(static_cast<const __m128i*>(_First));
unsigned int _Bingo = static_cast<unsigned int>(_mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand)));

_Bingo &= _Mask;

for (;;) {
if (_Bingo != 0) {
unsigned long _Offset;
_BitScanForward(&_Offset, _Bingo); // lgtm [cpp/conditionallyuninitializedvariable]
_Advance_bytes(_First, _Offset);
return _First;
}

_Advance_bytes(_First, 16);

_Data = _mm_load_si128(static_cast<const __m128i*>(_First));
_Bingo = static_cast<unsigned int>(_mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand)));
}
}
#endif // !_M_ARM64EC
// TRANSITION, ABI: used only in functions preserved for binary compatibility
template <class _Ty>
const void* __std_find_trivial_unsized_impl(const void* const _First, const _Ty _Val) noexcept {
auto _Ptr = static_cast<const _Ty*>(_First);
while (*_Ptr != _Val) {
++_Ptr;
}
return _Ptr;
}

// The below functions have exactly the same signature as the extern "C" functions, up to calling convention.
// This makes sure the template specialization can be fused with the extern "C" function.
// In optimized builds it avoids an extra call, as these functions are too large to inline.

template <class _Traits, class _Ty>
const void* __stdcall __std_find_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept {
#ifndef _M_ARM64EC
Expand Down Expand Up @@ -2079,20 +2013,25 @@ namespace {

extern "C" {

// TRANSITION, ABI: preserved for binary compatibility
const void* __stdcall __std_find_trivial_unsized_1(const void* const _First, const uint8_t _Val) noexcept {
return __std_find_trivial_unsized_impl<_Find_traits_1>(_First, _Val);
return memchr(_First, _Val, SIZE_MAX);
}

// TRANSITION, ABI: preserved for binary compatibility
const void* __stdcall __std_find_trivial_unsized_2(const void* const _First, const uint16_t _Val) noexcept {
return __std_find_trivial_unsized_impl<_Find_traits_2>(_First, _Val);
// TRANSITION, DevCom-1614562: not trying wmemchr
return __std_find_trivial_unsized_impl(_First, _Val);
}

// TRANSITION, ABI: preserved for binary compatibility
const void* __stdcall __std_find_trivial_unsized_4(const void* const _First, const uint32_t _Val) noexcept {
return __std_find_trivial_unsized_impl<_Find_traits_4>(_First, _Val);
return __std_find_trivial_unsized_impl(_First, _Val);
}

// TRANSITION, ABI: preserved for binary compatibility
const void* __stdcall __std_find_trivial_unsized_8(const void* const _First, const uint64_t _Val) noexcept {
return __std_find_trivial_unsized_impl<_Find_traits_8>(_First, _Val);
return __std_find_trivial_unsized_impl(_First, _Val);
}

const void* __stdcall __std_find_trivial_1(
Expand Down