diff --git a/benchmarks/src/find_and_count.cpp b/benchmarks/src/find_and_count.cpp
index 0b7561d4a9..7b205aee6a 100644
--- a/benchmarks/src/find_and_count.cpp
+++ b/benchmarks/src/find_and_count.cpp
@@ -42,31 +42,12 @@ BENCHMARK(bm);
 BENCHMARK(bm);
 BENCHMARK(bm);
-BENCHMARK(bm);
 
 BENCHMARK(bm);
 BENCHMARK(bm);
-BENCHMARK(bm);
 
 BENCHMARK(bm);
 BENCHMARK(bm);
-BENCHMARK(bm);
 
 BENCHMARK(bm);
-BENCHMARK(bm);
-BENCHMARK(bm);
-BENCHMARK(bm);
-
-BENCHMARK(bm);
-BENCHMARK(bm);
-BENCHMARK(bm);
-
-BENCHMARK(bm);
-BENCHMARK(bm);
-BENCHMARK(bm);
-
-BENCHMARK(bm);
-BENCHMARK(bm);
-BENCHMARK(bm);
-
 
 BENCHMARK_MAIN();
diff --git a/stl/inc/xutility b/stl/inc/xutility
index c946f9379d..260c44f211 100644
--- a/stl/inc/xutility
+++ b/stl/inc/xutility
@@ -90,11 +90,6 @@ const void* __stdcall __std_find_trivial_2(const void* _First, const void* _Last
 const void* __stdcall __std_find_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
 const void* __stdcall __std_find_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
 
-const void* __stdcall __std_find_trivial_unsized_1(const void* _First, uint8_t _Val) noexcept;
-const void* __stdcall __std_find_trivial_unsized_2(const void* _First, uint16_t _Val) noexcept;
-const void* __stdcall __std_find_trivial_unsized_4(const void* _First, uint32_t _Val) noexcept;
-const void* __stdcall __std_find_trivial_unsized_8(const void* _First, uint64_t _Val) noexcept;
-
 const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, bool _Signed) noexcept;
 const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept;
 const void* __stdcall __std_min_element_4(const void* _First, const void* _Last, bool _Signed) noexcept;
@@ -170,27 +165,6 @@ _Ty* __std_find_trivial(_Ty* const _First, _Ty* const _Last, const _TVal _Val) n
     }
 }
 
-template <class _Ty, class _TVal>
-_Ty* __std_find_trivial_unsized(_Ty* const _First, const _TVal _Val) noexcept {
-    if constexpr (is_pointer_v<_TVal> || is_null_pointer_v<_TVal>) {
-        return _STD __std_find_trivial_unsized(_First, reinterpret_cast<uintptr_t>(_Val));
-    } else if constexpr (sizeof(_Ty) == 1) {
-        return const_cast<_Ty*>(
-            static_cast<const _Ty*>(::__std_find_trivial_unsized_1(_First, static_cast<uint8_t>(_Val))));
-    } else if constexpr (sizeof(_Ty) == 2) {
-        return const_cast<_Ty*>(
-            static_cast<const _Ty*>(::__std_find_trivial_unsized_2(_First, static_cast<uint16_t>(_Val))));
-    } else if constexpr (sizeof(_Ty) == 4) {
-        return const_cast<_Ty*>(
-            static_cast<const _Ty*>(::__std_find_trivial_unsized_4(_First, static_cast<uint32_t>(_Val))));
-    } else if constexpr (sizeof(_Ty) == 8) {
-        return const_cast<_Ty*>(
-            static_cast<const _Ty*>(::__std_find_trivial_unsized_8(_First, static_cast<uint64_t>(_Val))));
-    } else {
-        static_assert(_Always_false<_Ty>, "Unexpected size");
-    }
-}
-
 template <class _Ty>
 _Ty* __std_min_element(_Ty* const _First, _Ty* const _Last) noexcept {
     constexpr bool _Signed = is_signed_v<_Ty>;
@@ -5975,8 +5949,14 @@ namespace ranges {
     template <input_iterator _It, sentinel_for<_It> _Se, class _Ty, class _Pj = identity>
         requires indirect_binary_predicate<ranges::equal_to, projected<_It, _Pj>, const _Ty*>
     _NODISCARD constexpr _It _Find_unchecked(_It _First, const _Se _Last, const _Ty& _Val, _Pj _Proj = {}) {
-        constexpr bool _Is_sized = sized_sentinel_for<_Se, _It>;
-        if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty> && _Sized_or_unreachable_sentinel_for<_Se, _It>
+        // TRANSITION, DevCom-1614562: not trying wmemchr
+        // Only single-byte elements are suitable for unsized optimization
+        constexpr bool _Single_byte_elements = sizeof(_Iter_value_t<_It>) == 1;
+        constexpr bool _Is_sized             = sized_sentinel_for<_Se, _It>;
+
+        if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty>
+                          && (_Single_byte_elements ? _Is_sized || same_as<_Se, unreachable_sentinel_t>
+                                                    : _Is_sized && _USE_STD_VECTOR_ALGORITHMS)
                       && same_as<_Pj, identity>) {
             if (!_STD is_constant_evaluated()) {
                 if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) {
@@ -5988,27 +5968,19 @@ namespace ranges {
                     }
                 }
 
-                using _Ptr_t = remove_reference_t<_Iter_ref_t<_It>>*;
-#if _USE_STD_VECTOR_ALGORITHMS
+                using _Ptr_t          = remove_reference_t<_Iter_ref_t<_It>>*;
                 const auto _First_ptr = _STD _To_address(_First);
                 _Ptr_t _Result;
-
+#if _USE_STD_VECTOR_ALGORITHMS
                 if constexpr (_Is_sized) {
+                    // When _Is_sized && _Single_byte_elements, prefer this over memchr() for performance
                     const auto _Last_ptr = _First_ptr + (_Last - _First);
-
-                    _Result = _STD __std_find_trivial(_First_ptr, _Last_ptr, _Val);
-                } else {
-                    _Result = _STD __std_find_trivial_unsized(_First_ptr, _Val);
-                }
-
-                if constexpr (is_pointer_v<_It>) {
-                    return _Result;
-                } else {
-                    return _RANGES next(_STD move(_First), _Result - _First_ptr);
-                }
-#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv
-                if constexpr (sizeof(_Iter_value_t<_It>) == 1) {
+                    _Result              = _STD __std_find_trivial(_First_ptr, _Last_ptr, _Val);
+                } else
+#endif // ^^^ _USE_STD_VECTOR_ALGORITHMS ^^^
+                {
+                    _STL_INTERNAL_STATIC_ASSERT(_Single_byte_elements);
                     size_t _Count;
                     if constexpr (_Is_sized) {
                         _Count = static_cast<size_t>(_Last - _First);
@@ -6016,21 +5988,20 @@
                         _Count = SIZE_MAX;
                     }
 
-                    const auto _First_ptr = _STD to_address(_First);
-                    const auto _Result =
-                        static_cast<_Ptr_t>(_CSTD memchr(_First_ptr, static_cast<unsigned char>(_Val), _Count));
-                    if (_Result) {
-                        if constexpr (is_pointer_v<_It>) {
-                            return _Result;
-                        } else {
-                            return _RANGES next(_STD move(_First), _Result - _First_ptr);
+                    _Result = static_cast<_Ptr_t>(_CSTD memchr(_First_ptr, static_cast<unsigned char>(_Val), _Count));
+
+                    if constexpr (_Is_sized) {
+                        if (_Result == nullptr) {
+                            return _RANGES next(_STD move(_First), _Last);
                         }
-                    } else {
-                        return _RANGES next(_STD move(_First), _Last);
                     }
                 }
-                // TRANSITION, DevCom-1614562: not trying wmemchr
-#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^
+
+                if constexpr (is_pointer_v<_It>) {
+                    return _Result;
+                } else {
+                    return _RANGES next(_STD move(_First), _Result - _First_ptr);
+                }
             }
         }
 
diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index bf9de5a308..d1e2b654e4 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -1837,79 +1837,9 @@ namespace {
 #endif // !_M_ARM64EC
     };
 
-    // The below functions have exactly the same signature as the extern "C" functions, up to calling convention.
-    // This makes sure the template specialization is fused with the extern "C" function.
-    // In optimized builds it avoids an extra call, as this function is too large to inline.
-
-    template <class _Traits, class _Ty>
-    const void* __stdcall __std_find_trivial_unsized_impl(const void* _First, const _Ty _Val) noexcept {
-#ifndef _M_ARM64EC
-        if ((reinterpret_cast<uintptr_t>(_First) & (sizeof(_Ty) - 1)) != 0) {
-            // _First isn't aligned to sizeof(_Ty), so we need to use the scalar fallback below.
-            // This can happen with 8-byte elements on x86's 4-aligned stack. It can also happen with packed structs.
-        } else if (_Use_avx2()) {
-            _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
-
-            // We read by vector-sized pieces, and we align pointers to vector-sized boundary.
-            // From start partial piece we mask out matches that don't belong to the range.
-            // This makes sure we never cross page boundary, thus we read 'as if' sequentially.
-            constexpr size_t _Vector_pad_mask = 0x1F;
-            constexpr unsigned int _Full_mask = 0xFFFF'FFFF;
-
-            const __m256i _Comparand   = _Traits::_Set_avx(_Val);
-            const intptr_t _Pad_start  = reinterpret_cast<intptr_t>(_First) & _Vector_pad_mask;
-            const unsigned int _Mask   = _Full_mask << _Pad_start;
-            _Advance_bytes(_First, -_Pad_start);
-
-            __m256i _Data        = _mm256_load_si256(static_cast<const __m256i*>(_First));
-            unsigned int _Bingo  = static_cast<unsigned int>(_mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand)));
-
-            _Bingo &= _Mask;
-
-            for (;;) {
-                if (_Bingo != 0) {
-                    unsigned long _Offset = _tzcnt_u32(_Bingo);
-                    _Advance_bytes(_First, _Offset);
-                    return _First;
-                }
-
-                _Advance_bytes(_First, 32);
-
-                _Data  = _mm256_load_si256(static_cast<const __m256i*>(_First));
-                _Bingo = static_cast<unsigned int>(_mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand)));
-            }
-        } else if (_Traits::_Sse_available()) {
-            // We read by vector-sized pieces, and we align pointers to vector-sized boundary.
-            // From start partial piece we mask out matches that don't belong to the range.
-            // This makes sure we never cross page boundary, thus we read 'as if' sequentially.
-            constexpr size_t _Vector_pad_mask = 0xF;
-            constexpr unsigned int _Full_mask = 0xFFFF;
-
-            const __m128i _Comparand   = _Traits::_Set_sse(_Val);
-            const intptr_t _Pad_start  = reinterpret_cast<intptr_t>(_First) & _Vector_pad_mask;
-            const unsigned int _Mask   = _Full_mask << _Pad_start;
-            _Advance_bytes(_First, -_Pad_start);
-
-            __m128i _Data        = _mm_load_si128(static_cast<const __m128i*>(_First));
-            unsigned int _Bingo  = static_cast<unsigned int>(_mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand)));
-
-            _Bingo &= _Mask;
-
-            for (;;) {
-                if (_Bingo != 0) {
-                    unsigned long _Offset;
-                    _BitScanForward(&_Offset, _Bingo); // lgtm [cpp/conditionallyuninitializedvariable]
-                    _Advance_bytes(_First, _Offset);
-                    return _First;
-                }
-
-                _Advance_bytes(_First, 16);
-
-                _Data  = _mm_load_si128(static_cast<const __m128i*>(_First));
-                _Bingo = static_cast<unsigned int>(_mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand)));
-            }
-        }
-#endif // !_M_ARM64EC
+    // TRANSITION, ABI: used only in functions preserved for binary compatibility
+    template <class _Ty>
+    const void* __std_find_trivial_unsized_impl(const void* const _First, const _Ty _Val) noexcept {
         auto _Ptr = static_cast<const _Ty*>(_First);
         while (*_Ptr != _Val) {
             ++_Ptr;
@@ -1917,6 +1847,10 @@
         return _Ptr;
     }
 
+    // The below functions have exactly the same signature as the extern "C" functions, up to calling convention.
+    // This makes sure the template specialization can be fused with the extern "C" function.
+    // In optimized builds it avoids an extra call, as these functions are too large to inline.
+
     template <class _Traits, class _Ty>
     const void* __stdcall __std_find_trivial_impl(const void* _First, const void* _Last, _Ty _Val) noexcept {
 #ifndef _M_ARM64EC
@@ -2079,20 +2013,25 @@
 
 extern "C" {
 
+// TRANSITION, ABI: preserved for binary compatibility
 const void* __stdcall __std_find_trivial_unsized_1(const void* const _First, const uint8_t _Val) noexcept {
-    return __std_find_trivial_unsized_impl<_Find_traits_1>(_First, _Val);
+    return memchr(_First, _Val, SIZE_MAX);
 }
 
+// TRANSITION, ABI: preserved for binary compatibility
const void* __stdcall __std_find_trivial_unsized_2(const void* const _First, const uint16_t _Val) noexcept {
-    return __std_find_trivial_unsized_impl<_Find_traits_2>(_First, _Val);
+    // TRANSITION, DevCom-1614562: not trying wmemchr
+    return __std_find_trivial_unsized_impl(_First, _Val);
 }
 
+// TRANSITION, ABI: preserved for binary compatibility
 const void* __stdcall __std_find_trivial_unsized_4(const void* const _First, const uint32_t _Val) noexcept {
-    return __std_find_trivial_unsized_impl<_Find_traits_4>(_First, _Val);
+    return __std_find_trivial_unsized_impl(_First, _Val);
 }
 
+// TRANSITION, ABI: preserved for binary compatibility
 const void* __stdcall __std_find_trivial_unsized_8(const void* const _First, const uint64_t _Val) noexcept {
-    return __std_find_trivial_unsized_impl<_Find_traits_8>(_First, _Val);
+    return __std_find_trivial_unsized_impl(_First, _Val);
 }
 
 const void* __stdcall __std_find_trivial_1(
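
Note on the unsized path: after this change, the byte-sized "find until found" case no longer goes through a hand-written AVX/SSE loop; it relies on `memchr` with an effectively unbounded count, which is safe because the caller (an `unreachable_sentinel` range) promises the value is present and `memchr` stops at the first match. A minimal standalone sketch of that pattern follows — it is illustrative only, `find_byte_unsized` is a hypothetical helper and not an STL entry point:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iterator>

// Hypothetical helper mirroring the memchr fallback used for single-byte elements.
const char* find_byte_unsized(const char* first, char val) noexcept {
    // memchr stops at the first match, so SIZE_MAX does not cause reads past the
    // match as long as the value actually occurs within the object.
    return static_cast<const char*>(std::memchr(first, static_cast<unsigned char>(val), SIZE_MAX));
}

int main() {
    const char text[] = "vectorized find";
    // The pattern this patch targets: ranges::find over byte-sized elements with an
    // unreachable sentinel, i.e. the match is known to exist.
    const char* by_ranges = std::ranges::find(text, std::unreachable_sentinel, 'f');
    const char* by_memchr = find_byte_unsized(text, 'f');
    assert(by_ranges == by_memchr);
}
```

Sized ranges still dispatch to the vectorized `__std_find_trivial` family; only the unsized, single-byte case defers to the CRT's `memchr`, which is itself vectorized.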