From c19aa79e7013819150a5a1e695259a8471508c5c Mon Sep 17 00:00:00 2001
From: Alex Guteniev
Date: Thu, 6 Apr 2023 17:27:01 +0300
Subject: [PATCH 1/2] Add vzeroupper, so that it is there even in /Od

Fix #3601
---
 stl/src/vector_algorithms.cpp | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index a38d93c281..ae3e6d778f 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -66,6 +66,14 @@ namespace {
     void _Advance_bytes(const void*& _Target, ptrdiff_t _Offset) noexcept {
         _Target = static_cast<const unsigned char*>(_Target) + _Offset;
     }
+
+    // TRANSITION, DevCom-10331414
+    struct _Zeroupper_on_exit {
+        ~_Zeroupper_on_exit() {
+            _mm256_zeroupper();
+        }
+    };
+
 } // unnamed namespace
 
 extern "C" {
@@ -89,6 +97,8 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
             _Advance_bytes(_First1, 32);
             _Advance_bytes(_First2, 32);
         } while (_First1 != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     constexpr size_t _Mask_16 = ~((static_cast<size_t>(1) << 4) - 1);
@@ -174,6 +184,8 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_1(void* _Firs
             _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed);
             _Advance_bytes(_First, 32);
         } while (_First != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 32 && _Use_sse42()) {
@@ -214,6 +226,8 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_2(void* _Firs
             _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed);
             _Advance_bytes(_First, 32);
         } while (_First != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 32 && _Use_sse42()) {
@@ -250,6 +264,8 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_4(void* _Firs
             _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed);
             _Advance_bytes(_First, 32);
         } while (_First != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 32 && _Use_sse2()) {
@@ -284,6 +300,8 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_8(void* _Firs
             _mm256_storeu_si256(static_cast<__m256i*>(_Last), _Left_reversed);
             _Advance_bytes(_First, 32);
         } while (_First != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 32 && _Use_sse2()) {
@@ -320,6 +338,8 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_1(
             _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
             _Advance_bytes(_Dest, 32);
         } while (_Dest != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 16 && _Use_sse42()) {
@@ -355,6 +375,8 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_2(
             _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
             _Advance_bytes(_Dest, 32);
         } while (_Dest != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 16 && _Use_sse42()) {
@@ -387,6 +409,8 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4(
             _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
             _Advance_bytes(_Dest, 32);
         } while (_Dest != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 16 && _Use_sse2()) {
@@ -417,6 +441,8 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
             _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
             _Advance_bytes(_Dest, 32);
         } while (_Dest != _Stop_at);
+
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
     }
 
     if (_Byte_length(_First, _Last) >= 16 && _Use_sse2()) {
@@ -1200,6 +1226,8 @@ namespace {
     template <class _Ty>
     const void* __stdcall __std_find_trivial_unsized(const void* _First, const _Ty _Val) noexcept {
         if (_Use_avx2()) {
+            _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
+
             // We read by vector-sized pieces, and we align pointers to vector-sized boundary.
             // From start partial piece we mask out matches that don't belong to the range.
             // This makes sure we never cross page boundary, thus we read 'as if' sequentially.
@@ -1282,6 +1310,8 @@ namespace {
 
         const size_t _Avx_size = _Size_bytes & ~size_t{0x1F};
         if (_Avx_size != 0 && _Use_avx2()) {
+            _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
+
             const __m256i _Comparand = _Traits::_Set_avx(_Val);
             const void* _Stop_at = _First;
             _Advance_bytes(_Stop_at, _Avx_size);
@@ -1341,6 +1371,8 @@ namespace {
                 _Advance_bytes(_First, 32);
             } while (_First != _Stop_at);
             _Size_bytes &= 0x1F;
+
+            _mm256_zeroupper(); // TRANSITION, DevCom-10331414
         }
 
         const size_t _Sse_size = _Size_bytes & ~size_t{0xF};

From 84bdeb8cf3761709af086c4f8acc94b0f738bb67 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej"
Date: Thu, 6 Apr 2023 11:32:11 -0700
Subject: [PATCH 2/2] Make `_Zeroupper_on_exit` nodiscard and noncopyable.

---
 stl/src/vector_algorithms.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index ae3e6d778f..ea24f5ff4b 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -68,7 +68,12 @@ namespace {
     }
 
     // TRANSITION, DevCom-10331414
-    struct _Zeroupper_on_exit {
+    struct [[nodiscard]] _Zeroupper_on_exit {
+        _Zeroupper_on_exit() = default;
+
+        _Zeroupper_on_exit(const _Zeroupper_on_exit&)            = delete;
+        _Zeroupper_on_exit& operator=(const _Zeroupper_on_exit&) = delete;
+
         ~_Zeroupper_on_exit() {
             _mm256_zeroupper();
         }
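
Editor's note on the pattern these patches introduce, with a hedged sketch. When AVX2 code leaves the upper halves of the YMM registers dirty, later legacy-SSE code can pay AVX-SSE transition penalties; the optimizer normally emits `vzeroupper` on exit from AVX code, but per the commit subject this does not happen in /Od builds (the compiler issue tracked as DevCom-10331414), so the patches insert it explicitly: a plain `_mm256_zeroupper()` after single-exit `do`/`while` loops, and the RAII `_Zeroupper_on_exit` guard in functions such as `__std_find_trivial_unsized` that return from inside the AVX2 block. The sketch below illustrates the same guard pattern in isolation; it is not the STL's code — `Zeroupper_on_exit` and `sum_bytes_avx2` are illustrative names, the runtime CPU dispatch the STL does via `_Use_avx2()` is omitted, and it assumes an x86/x64 target with AVX2 code generation enabled (e.g. -mavx2 on GCC/Clang; MSVC compiles the intrinsics without a flag).

#include <immintrin.h>
#include <cstddef>

namespace {
    // RAII guard: runs vzeroupper on every path out of the enclosing scope,
    // including early returns, even when the compiler inserts none itself.
    struct [[nodiscard]] Zeroupper_on_exit {
        Zeroupper_on_exit() = default;

        Zeroupper_on_exit(const Zeroupper_on_exit&)            = delete;
        Zeroupper_on_exit& operator=(const Zeroupper_on_exit&) = delete;

        ~Zeroupper_on_exit() {
            _mm256_zeroupper(); // zero the upper 128 bits of all YMM registers
        }
    };
} // unnamed namespace

// Hypothetical AVX2 kernel: sums bytes 32 at a time, then returns through the
// guard so subsequent SSE code sees clean upper YMM state.
long long sum_bytes_avx2(const unsigned char* first, std::size_t count) {
    Zeroupper_on_exit guard; // vzeroupper runs when this function returns

    __m256i acc    = _mm256_setzero_si256();
    std::size_t i  = 0;
    for (; i + 32 <= count; i += 32) {
        const __m256i block = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(first + i));
        // vpsadbw against zero yields four 64-bit partial sums of the 32 bytes
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(block, _mm256_setzero_si256()));
    }

    alignas(32) long long parts[4];
    _mm256_store_si256(reinterpret_cast<__m256i*>(parts), acc);
    long long total = parts[0] + parts[1] + parts[2] + parts[3];

    for (; i < count; ++i) { // scalar tail for the remaining bytes
        total += first[i];
    }
    return total;
}

The second patch's hardening fits the same intent: `[[nodiscard]]` helps catch a guard accidentally written as an immediately-destroyed temporary (which would run `vzeroupper` at once instead of at scope exit), and deleting copy operations keeps a single guard from being duplicated into scopes it was not meant to cover.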