From 13535bbc7969d67ea06204913dcec3dc03d171c9 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sun, 9 Aug 2020 17:49:23 +0100 Subject: [PATCH 1/7] Intrinsicify IndexOfAny(char,char) --- .../src/System/SpanHelpers.Char.cs | 369 ++++++++++-------- 1 file changed, 210 insertions(+), 159 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 16b680ca53da1b..1bb9edfcc733ea 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -493,202 +493,238 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length) } [MethodImpl(MethodImplOptions.AggressiveOptimization)] - public static unsafe int IndexOfAny(ref char searchSpace, char value0, char value1, int length) + public static unsafe int IndexOfAny(ref char searchStart, char value0, char value1, int length) { Debug.Assert(length >= 0); - fixed (char* pChars = &searchSpace) - { - char* pCh = pChars; - char* pEndCh = pCh + length; + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)length; - if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + if (Sse2.IsSupported) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) { - // Figure out how many characters to read sequentially until we are vector aligned - // This is equivalent to: - // unaligned = ((int)pCh % Unsafe.SizeOf>()) / elementsPerByte - // length = (Vector.Count - unaligned) % Vector.Count - const int elementsPerByte = sizeof(ushort) / sizeof(byte); - int unaligned = ((int)pCh & (Unsafe.SizeOf>() - 1)) / elementsPerByte; - length = (Vector.Count - unaligned) & (Vector.Count - 1); + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // We jump forward to the intrinsics at the end of them method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; } - - SequentialScan: - while (length >= 4) + } + else if (Vector.IsHardwareAccelerated) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. 
+ nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) { - length -= 4; + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto VectorCompare; + } + } - if (pCh[0] == value0 || pCh[0] == value1) - goto Found; - if (pCh[1] == value0 || pCh[1] == value1) - goto Found1; - if (pCh[2] == value0 || pCh[2] == value1) - goto Found2; - if (pCh[3] == value0 || pCh[3] == value1) - goto Found3; + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchStart, offset); - pCh += 4; - } + lookUp = current; + if (value0 == lookUp || value1 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp) + goto Found3; - while (length > 0) - { - length--; + offset += 4; + lengthToExamine -= 4; + } - if (pCh[0] == value0 || pCh[0] == value1) - goto Found; + while (lengthToExamine > 0) + { + lookUp = Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp) + goto Found; - pCh++; - } + offset += 1; + lengthToExamine -= 1; + } - // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow - // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. - if (Vector.IsHardwareAccelerated && pCh < pEndCh) + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) { - // Get the highest multiple of Vector.Count that is within the search space. - // That will be how many times we iterate in the loop below. - // This is equivalent to: length = Vector.Count * ((int)(pEndCh - pCh) / Vector.Count) - length = (int)((pEndCh - pCh) & ~(Vector.Count - 1)); + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create(value0); + Vector256 values1 = Vector256.Create(value1); - // Get comparison Vector - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. 
+ while (lengthToExamine > offset) + { + search = LoadVector256(ref searchStart, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } - while (length > 0) + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)) + .AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) { - // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned - Debug.Assert(((int)pCh & (Unsafe.SizeOf>() - 1)) == 0); - Vector vData = Unsafe.Read>(pCh); - var vMatches = Vector.BitwiseOr( - Vector.Equals(vData, values0), - Vector.Equals(vData, values1)); - if (Vector.Zero.Equals(vMatches)) + search = LoadVector128(ref searchStart, offset); + + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) { - pCh += Vector.Count; - length -= Vector.Count; + // None matched + offset += (nuint)Vector128.Count; continue; } - // Find offset of first match - return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches); - } - if (pCh < pEndCh) + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)) + .AsByte()); + if (matches == 0) { - length = (int)(pEndCh - pCh); - goto SequentialScan; + // None matched + goto NotFound; } } - return -1; - Found3: - pCh++; - Found2: - pCh++; - Found1: - pCh++; - Found: - return (int)(pCh - pChars); + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset, + // flags are in bytes so divide by 2 for chars (shift right by 1) + offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + goto Found; } - } - - [MethodImpl(MethodImplOptions.AggressiveOptimization)] - public static unsafe int IndexOfAny(ref char searchSpace, char value0, char value1, char value2, int length) - { - Debug.Assert(length >= 0); - fixed (char* pChars = &searchSpace) + VectorCompare: + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. 
+ if (!Sse2.IsSupported && Vector.IsHardwareAccelerated) { - char* pCh = pChars; - char* pEndCh = pCh + length; + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); - if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) { - // Figure out how many characters to read sequentially until we are vector aligned - // This is equivalent to: - // unaligned = ((int)pCh % Unsafe.SizeOf>()) / elementsPerByte - // length = (Vector.Count - unaligned) % Vector.Count - const int elementsPerByte = sizeof(ushort) / sizeof(byte); - int unaligned = ((int)pCh & (Unsafe.SizeOf>() - 1)) / elementsPerByte; - length = (Vector.Count - unaligned) & (Vector.Count - 1); - } - - SequentialScan: - while (length >= 4) - { - length -= 4; - - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2) - goto Found; - if (pCh[1] == value0 || pCh[1] == value1 || pCh[1] == value2) - goto Found1; - if (pCh[2] == value0 || pCh[2] == value1 || pCh[2] == value2) - goto Found2; - if (pCh[3] == value0 || pCh[3] == value1 || pCh[3] == value2) - goto Found3; + search = LoadVector(ref searchStart, offset); + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) + { + // None matched + offset += (nuint)Vector.Count; + continue; + } - pCh += 4; + goto Difference; } - while (length > 0) + // Move to Vector length from end for final compare + search = LoadVector(ref searchStart, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) { - length--; - - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2) - goto Found; - - pCh++; + // None matched + goto NotFound; } - // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow - // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. - if (Vector.IsHardwareAccelerated && pCh < pEndCh) - { - // Get the highest multiple of Vector.Count that is within the search space. - // That will be how many times we iterate in the loop below. 
- // This is equivalent to: length = Vector.Count * ((int)(pEndCh - pCh) / Vector.Count) - length = (int)((pEndCh - pCh) & ~(Vector.Count - 1)); - - // Get comparison Vector - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); - Vector values2 = new Vector(value2); - - while (length > 0) - { - // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned - Debug.Assert(((int)pCh & (Unsafe.SizeOf>() - 1)) == 0); - Vector vData = Unsafe.Read>(pCh); - var vMatches = Vector.BitwiseOr( - Vector.BitwiseOr( - Vector.Equals(vData, values0), - Vector.Equals(vData, values1)), - Vector.Equals(vData, values2)); - - if (Vector.Zero.Equals(vMatches)) - { - pCh += Vector.Count; - length -= Vector.Count; - continue; - } - // Find offset of first match - return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches); - } - - if (pCh < pEndCh) - { - length = (int)(pEndCh - pCh); - goto SequentialScan; - } - } - return -1; - Found3: - pCh++; - Found2: - pCh++; - Found1: - pCh++; - Found: - return (int)(pCh - pChars); + Difference: + offset += (nuint)LocateFirstFoundChar(search); } + + goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -1044,14 +1080,29 @@ private static int LocateLastFoundChar(ulong match) private static Vector LoadVector(ref char start, nint offset) => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, offset))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector LoadVector(ref char start, nuint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (nint)offset))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 LoadVector128(ref char start, nint offset) => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, offset))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 LoadVector128(ref char start, nuint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (nint)offset))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector256 LoadVector256(ref char start, nint offset) => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, offset))); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 LoadVector256(ref char start, nuint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (nint)offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ref char Add(ref char start, nuint offset) => ref Unsafe.Add(ref start, (nint)offset); + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static nint GetCharVectorSpanLength(nint offset, nint length) => (length - offset) & ~(Vector.Count - 1); From 1126912c5ef58eca5724e3fe530dae85780be925 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Wed, 12 Aug 2020 19:54:49 +0100 Subject: [PATCH 2/7] Intrinsicify IndexOfAny(char,char,char) --- .../src/System/SpanHelpers.Char.cs | 250 ++++++++++++++++++ 1 file changed, 250 insertions(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 1bb9edfcc733ea..52414aca687969 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -727,6 +727,256 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu goto Found; } + 
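
The IntrinsicsMatch arithmetic that these overloads share is easier to see in isolation. A minimal standalone sketch (illustrative names, not part of the patch) of how a byte-level MoveMask result is turned back into a char index, assuming SSE2 is available:

using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class MoveMaskSketch
{
    // Index of the first element of 'search' equal to 'value', or -1 if none match.
    internal static int FirstMatchIndex(Vector128<ushort> search, ushort value)
    {
        // CompareEqual sets all 16 bits of every matching ushort element, so after
        // AsByte each matching char contributes two adjacent bits to the mask.
        int matches = Sse2.MoveMask(Sse2.CompareEqual(Vector128.Create(value), search).AsByte());
        if (matches == 0)
            return -1;

        // The lowest set bit sits at twice the element index, so shifting the
        // trailing-zero count right by one recovers the char offset.
        return BitOperations.TrailingZeroCount(matches) >> 1;
    }
}

The Avx2 path in these patches follows the same pattern; the mask simply covers 32 bytes, and the same right shift by one still yields the element offset within the 16-char block.
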
[MethodImpl(MethodImplOptions.AggressiveOptimization)] + public static unsafe int IndexOfAny(ref char searchStart, char value0, char value1, char value2, int length) + { + Debug.Assert(length >= 0); + + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)length; + + if (Sse2.IsSupported) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) + { + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // We jump forward to the intrinsics at the end of them method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; + } + } + else if (Vector.IsHardwareAccelerated) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) + { + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto VectorCompare; + } + } + + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchStart, offset); + + lookUp = current; + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found3; + + offset += 4; + lengthToExamine -= 4; + } + + while (lengthToExamine > 0) + { + lookUp = Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp) + goto Found; + + offset += 1; + lengthToExamine -= 1; + } + + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) + { + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create(value0); + Vector256 values1 = Vector256.Create(value1); + Vector256 values2 = Vector256.Create(value2); + + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. 
+ while (lengthToExamine > offset) + { + search = LoadVector256(ref searchStart, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } + + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search)) + .AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchStart, offset); + + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector128.Count; + continue; + } + + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search)) + .AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + } + + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset, + // flags are in bytes so divide by 2 for chars (shift right by 1) + offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + goto Found; + } + + VectorCompare: + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (!Sse2.IsSupported && Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. 
+ while (lengthToExamine > offset) + { + search = LoadVector(ref searchStart, offset); + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) + { + // None matched + offset += (nuint)Vector.Count; + continue; + } + + goto Difference; + } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchStart, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + Difference: + offset += (nuint)LocateFirstFoundChar(search); + } + + goto Found; + } + [MethodImpl(MethodImplOptions.AggressiveOptimization)] public static unsafe int IndexOfAny(ref char searchSpace, char value0, char value1, char value2, char value3, int length) { From a872cee974513709356d3e1fd5e01568d9653dce Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Wed, 12 Aug 2020 20:02:20 +0100 Subject: [PATCH 3/7] Intrinsicify IndexOfAny(char,char,char,char) --- .../src/System/SpanHelpers.Char.cs | 289 +++++++++++++----- 1 file changed, 217 insertions(+), 72 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 52414aca687969..c0aeedf6c8feaa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -978,106 +978,251 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } [MethodImpl(MethodImplOptions.AggressiveOptimization)] - public static unsafe int IndexOfAny(ref char searchSpace, char value0, char value1, char value2, char value3, int length) + public static unsafe int IndexOfAny(ref char searchStart, char value0, char value1, char value2, char value3, int length) { Debug.Assert(length >= 0); - fixed (char* pChars = &searchSpace) - { - char* pCh = pChars; - char* pEndCh = pCh + length; + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)length; - if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + if (Sse2.IsSupported) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) { - // Figure out how many characters to read sequentially until we are vector aligned - // This is equivalent to: - // unaligned = ((int)pCh % Unsafe.SizeOf>()) / elementsPerByte - // length = (Vector.Count - unaligned) % Vector.Count - const int elementsPerByte = sizeof(ushort) / sizeof(byte); - int unaligned = ((int)pCh & (Unsafe.SizeOf>() - 1)) / elementsPerByte; - length = (Vector.Count - unaligned) & (Vector.Count - 1); + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // We jump forward to the intrinsics at the end of them method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. 
+ lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; } - - SequentialScan: - while (length >= 4) + } + else if (Vector.IsHardwareAccelerated) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) { - length -= 4; + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto VectorCompare; + } + } - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2 || pCh[0] == value3) - goto Found; - if (pCh[1] == value0 || pCh[1] == value1 || pCh[1] == value2 || pCh[1] == value3) - goto Found1; - if (pCh[2] == value0 || pCh[2] == value1 || pCh[2] == value2 || pCh[2] == value3) - goto Found2; - if (pCh[3] == value0 || pCh[3] == value1 || pCh[3] == value2 || pCh[3] == value3) - goto Found3; + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchStart, offset); - pCh += 4; - } + lookUp = current; + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found3; - while (length > 0) - { - length--; + offset += 4; + lengthToExamine -= 4; + } - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2 || pCh[0] == value3) - goto Found; + while (lengthToExamine > 0) + { + lookUp = Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp) + goto Found; - pCh++; - } + offset += 1; + lengthToExamine -= 1; + } - // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow - // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. - if (Vector.IsHardwareAccelerated && pCh < pEndCh) + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) { - // Get the highest multiple of Vector.Count that is within the search space. - // That will be how many times we iterate in the loop below. 
- // This is equivalent to: length = Vector.Count * ((int)(pEndCh - pCh) / Vector.Count) - length = (int)((pEndCh - pCh) & ~(Vector.Count - 1)); + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create(value0); + Vector256 values1 = Vector256.Create(value1); + Vector256 values2 = Vector256.Create(value2); + Vector256 values3 = Vector256.Create(value3); - // Get comparison Vector - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); - Vector values2 = new Vector(value2); - Vector values3 = new Vector(value3); + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchStart, offset); + // We preform the Or at non-Vector level as we are using the maximum number of non-preserved registers, + // and more causes them first to be pushed to stack and then popped on exit to preseve their values. + matches = Avx2.MoveMask(Avx2.CompareEqual(values0, search).AsByte()); + // Bitwise Or to combine the flagged matches for the second, third and fourth values to our match flags + matches |= Avx2.MoveMask(Avx2.CompareEqual(values1, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values2, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values3, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } - while (length > 0) + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask(Avx2.CompareEqual(values0, search).AsByte()); + // Bitwise Or to combine the flagged matches for the second, third and fourth values to our match flags + matches |= Avx2.MoveMask(Avx2.CompareEqual(values1, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values2, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values3, search).AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + Vector128 values3 = Vector128.Create(value3); + // First time this checks against 0 and we will move into final compare if it fails. 
+ while (lengthToExamine > offset) { - // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned - Debug.Assert(((int)pCh & (Unsafe.SizeOf>() - 1)) == 0); - Vector vData = Unsafe.Read>(pCh); - var vMatches = Vector.BitwiseOr( - Vector.BitwiseOr( - Vector.BitwiseOr(Vector.Equals(vData, values0), Vector.Equals(vData, values1)), - Vector.Equals(vData, values2)), - Vector.Equals(vData, values3)); + search = LoadVector128(ref searchStart, offset); - if (Vector.Zero.Equals(vMatches)) + matches = Sse2.MoveMask(Sse2.CompareEqual(values0, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values1, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values2, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values3, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) { - pCh += Vector.Count; - length -= Vector.Count; + // None matched + offset += (nuint)Vector128.Count; continue; } - // Find offset of first match - return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches); + + goto IntrinsicsMatch; } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask(Sse2.CompareEqual(values0, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values1, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values2, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values3, search).AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + } - if (pCh < pEndCh) + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset, + // flags are in bytes so divide by 2 for chars (shift right by 1) + offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + goto Found; + } + + VectorCompare: + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (!Sse2.IsSupported && Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); + Vector values3 = new Vector(value3); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. 
+ while (lengthToExamine > offset) + { + search = LoadVector(ref searchStart, offset); + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)), + Vector.Equals(search, values3)); + if (Vector.Zero.Equals(search)) { - length = (int)(pEndCh - pCh); - goto SequentialScan; + // None matched + offset += (nuint)Vector.Count; + continue; } + + goto Difference; } - return -1; - Found3: - pCh++; - Found2: - pCh++; - Found1: - pCh++; - Found: - return (int)(pCh - pChars); + // Move to Vector length from end for final compare + search = LoadVector(ref searchStart, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)), + Vector.Equals(search, values3)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + Difference: + offset += (nuint)LocateFirstFoundChar(search); } + + goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] From ebe40b6921f5f9d3da6bd0925071832d52f24537 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Wed, 12 Aug 2020 20:07:07 +0100 Subject: [PATCH 4/7] Intrinsicify IndexOfAny(char,char,char,char,char) --- .../src/System/SpanHelpers.Char.cs | 304 +++++++++++++----- 1 file changed, 229 insertions(+), 75 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index c0aeedf6c8feaa..9a7b6421c3e402 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -1226,109 +1226,263 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } [MethodImpl(MethodImplOptions.AggressiveOptimization)] - public static unsafe int IndexOfAny(ref char searchSpace, char value0, char value1, char value2, char value3, char value4, int length) + public static unsafe int IndexOfAny(ref char searchStart, char value0, char value1, char value2, char value3, char value4, int length) { Debug.Assert(length >= 0); - fixed (char* pChars = &searchSpace) - { - char* pCh = pChars; - char* pEndCh = pCh + length; + nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations + nuint lengthToExamine = (nuint)length; - if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + if (Sse2.IsSupported) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) { - // Figure out how many characters to read sequentially until we are vector aligned - // This is equivalent to: - // unaligned = ((int)pCh % Unsafe.SizeOf>()) / elementsPerByte - // length = (Vector.Count - unaligned) % Vector.Count - const int elementsPerByte = sizeof(ushort) / sizeof(byte); - int unaligned = ((int)pCh & (Unsafe.SizeOf>() - 1)) / elementsPerByte; - length = (Vector.Count - unaligned) & (Vector.Count - 1); + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // We jump forward to the intrinsics at the end of them method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. 
Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; } - - SequentialScan: - while (length >= 4) + } + else if (Vector.IsHardwareAccelerated) + { + // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) { - length -= 4; + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto VectorCompare; + } + } - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2 || pCh[0] == value3 || pCh[0] == value4) - goto Found; - if (pCh[1] == value0 || pCh[1] == value1 || pCh[1] == value2 || pCh[1] == value3 || pCh[1] == value4) - goto Found1; - if (pCh[2] == value0 || pCh[2] == value1 || pCh[2] == value2 || pCh[2] == value3 || pCh[2] == value4) - goto Found2; - if (pCh[3] == value0 || pCh[3] == value1 || pCh[3] == value2 || pCh[3] == value3 || pCh[3] == value4) - goto Found3; + int lookUp; + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchStart, offset); - pCh += 4; - } + lookUp = current; + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp || value4 == lookUp) + goto Found; + lookUp = Unsafe.Add(ref current, 1); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp || value4 == lookUp) + goto Found1; + lookUp = Unsafe.Add(ref current, 2); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp || value4 == lookUp) + goto Found2; + lookUp = Unsafe.Add(ref current, 3); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp || value4 == lookUp) + goto Found3; - while (length > 0) - { - length--; + offset += 4; + lengthToExamine -= 4; + } - if (pCh[0] == value0 || pCh[0] == value1 || pCh[0] == value2 || pCh[0] == value3 || pCh[0] == value4) - goto Found; + while (lengthToExamine > 0) + { + lookUp = Add(ref searchStart, offset); + if (value0 == lookUp || value1 == lookUp || value2 == lookUp || value3 == lookUp || value4 == lookUp) + goto Found; - pCh++; - } + offset += 1; + lengthToExamine -= 1; + } - // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow - // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. - if (Vector.IsHardwareAccelerated && pCh < pEndCh) + NotFound: + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)offset; + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (Sse2.IsSupported) + { + int matches; + if (Avx2.IsSupported) { - // Get the highest multiple of Vector.Count that is within the search space. - // That will be how many times we iterate in the loop below. 
- // This is equivalent to: length = Vector.Count * ((int)(pEndCh - pCh) / Vector.Count) - length = (int)((pEndCh - pCh) & ~(Vector.Count - 1)); + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) + { + Vector256 values0 = Vector256.Create(value0); + Vector256 values1 = Vector256.Create(value1); + Vector256 values2 = Vector256.Create(value2); + Vector256 values3 = Vector256.Create(value3); - // Get comparison Vector - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); - Vector values2 = new Vector(value2); - Vector values3 = new Vector(value3); - Vector values4 = new Vector(value4); + Vector256 values4 = Vector256.Create(value4); - while (length > 0) + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchStart, offset); + // We preform the Or at non-Vector level as we are using the maximum number of non-preserved registers (+ 1), + // and more causes them first to be pushed to stack and then popped on exit to preseve their values. + matches = Avx2.MoveMask(Avx2.CompareEqual(values0, search).AsByte()); + // Bitwise Or to combine the flagged matches for the second, third and fourth values to our match flags + matches |= Avx2.MoveMask(Avx2.CompareEqual(values1, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values2, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values3, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values4, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // None matched + offset += (nuint)Vector256.Count; + continue; + } + + goto IntrinsicsMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector256(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Avx2.MoveMask(Avx2.CompareEqual(values0, search).AsByte()); + // Bitwise Or to combine the flagged matches for the second, third and fourth values to our match flags + matches |= Avx2.MoveMask(Avx2.CompareEqual(values1, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values2, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values3, search).AsByte()); + matches |= Avx2.MoveMask(Avx2.CompareEqual(values4, search).AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; + } + + goto IntrinsicsMatch; + } + } + + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + Vector128 values3 = Vector128.Create(value3); + Vector128 values4 = Vector128.Create(value4); + // First time this checks against 0 and we will move into final compare if it fails. 
+ while (lengthToExamine > offset) { - // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned - Debug.Assert(((int)pCh & (Unsafe.SizeOf>() - 1)) == 0); - Vector vData = Unsafe.Read>(pCh); - var vMatches = Vector.BitwiseOr( - Vector.BitwiseOr( - Vector.BitwiseOr( - Vector.BitwiseOr(Vector.Equals(vData, values0), Vector.Equals(vData, values1)), - Vector.Equals(vData, values2)), - Vector.Equals(vData, values3)), - Vector.Equals(vData, values4)); + search = LoadVector128(ref searchStart, offset); - if (Vector.Zero.Equals(vMatches)) + matches = Sse2.MoveMask(Sse2.CompareEqual(values0, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values1, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values2, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values3, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values4, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) { - pCh += Vector.Count; - length -= Vector.Count; + // None matched + offset += (nuint)Vector128.Count; continue; } - // Find offset of first match - return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches); + + goto IntrinsicsMatch; + } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchStart, lengthToExamine); + offset = lengthToExamine; + // Same as method as above + matches = Sse2.MoveMask(Sse2.CompareEqual(values0, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values1, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values2, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values3, search).AsByte()); + matches |= Sse2.MoveMask(Sse2.CompareEqual(values4, search).AsByte()); + if (matches == 0) + { + // None matched + goto NotFound; } + } - if (pCh < pEndCh) + IntrinsicsMatch: + // Find bitflag offset of first difference and add to current offset, + // flags are in bytes so divide by 2 for chars (shift right by 1) + offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + goto Found; + } + + VectorCompare: + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. + if (!Sse2.IsSupported && Vector.IsHardwareAccelerated) + { + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); + Vector values3 = new Vector(value3); + Vector values4 = new Vector(value4); + + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. 
+ while (lengthToExamine > offset) + { + search = LoadVector(ref searchStart, offset); + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)), + Vector.Equals(search, values3)), + Vector.Equals(search, values4)); + if (Vector.Zero.Equals(search)) { - length = (int)(pEndCh - pCh); - goto SequentialScan; + // None matched + offset += (nuint)Vector.Count; + continue; } + + goto Difference; } - return -1; - Found3: - pCh++; - Found2: - pCh++; - Found1: - pCh++; - Found: - return (int)(pCh - pChars); + // Move to Vector length from end for final compare + search = LoadVector(ref searchStart, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)), + Vector.Equals(search, values3)), + Vector.Equals(search, values4)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + Difference: + offset += (nuint)LocateFirstFoundChar(search); } + + goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] From 44d524e8b039253405c78c5272a1be3eefd31284 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Thu, 13 Aug 2020 02:52:13 +0100 Subject: [PATCH 5/7] Avoid movsx --- .../src/System/SpanHelpers.Char.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 9a7b6421c3e402..506ec373b3f644 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -498,7 +498,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu Debug.Assert(length >= 0); nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations - nuint lengthToExamine = (nuint)length; + nuint lengthToExamine = (nuint)(uint)length; if (Sse2.IsSupported) { @@ -679,7 +679,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu IntrinsicsMatch: // Find bitflag offset of first difference and add to current offset, // flags are in bytes so divide by 2 for chars (shift right by 1) - offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + offset += (nuint)(uint)BitOperations.TrailingZeroCount(matches) >> 1; goto Found; } @@ -721,7 +721,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } Difference: - offset += (nuint)LocateFirstFoundChar(search); + offset += (nuint)(uint)LocateFirstFoundChar(search); } goto Found; @@ -733,7 +733,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu Debug.Assert(length >= 0); nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations - nuint lengthToExamine = (nuint)length; + nuint lengthToExamine = (nuint)(uint)length; if (Sse2.IsSupported) { @@ -924,7 +924,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu IntrinsicsMatch: // Find bitflag offset of first difference and add to current offset, // flags are in bytes so divide by 2 for chars (shift right by 1) - offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + offset += 
(nuint)(uint)BitOperations.TrailingZeroCount(matches) >> 1; goto Found; } @@ -971,7 +971,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } Difference: - offset += (nuint)LocateFirstFoundChar(search); + offset += (nuint)(uint)LocateFirstFoundChar(search); } goto Found; @@ -983,7 +983,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu Debug.Assert(length >= 0); nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations - nuint lengthToExamine = (nuint)length; + nuint lengthToExamine = (nuint)(uint)length; if (Sse2.IsSupported) { @@ -1167,7 +1167,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu IntrinsicsMatch: // Find bitflag offset of first difference and add to current offset, // flags are in bytes so divide by 2 for chars (shift right by 1) - offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + offset += (nuint)(uint)BitOperations.TrailingZeroCount(matches) >> 1; goto Found; } @@ -1219,7 +1219,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } Difference: - offset += (nuint)LocateFirstFoundChar(search); + offset += (nuint)(uint)LocateFirstFoundChar(search); } goto Found; @@ -1231,7 +1231,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu Debug.Assert(length >= 0); nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations - nuint lengthToExamine = (nuint)length; + nuint lengthToExamine = (nuint)(uint)length; if (Sse2.IsSupported) { @@ -1422,7 +1422,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu IntrinsicsMatch: // Find bitflag offset of first difference and add to current offset, // flags are in bytes so divide by 2 for chars (shift right by 1) - offset += (nuint)BitOperations.TrailingZeroCount(matches) >> 1; + offset += (nuint)(uint)BitOperations.TrailingZeroCount(matches) >> 1; goto Found; } @@ -1479,7 +1479,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } Difference: - offset += (nuint)LocateFirstFoundChar(search); + offset += (nuint)(uint)LocateFirstFoundChar(search); } goto Found; From 006487edac309220d55b811229161b98b5f8c2ee Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Fri, 14 Aug 2020 22:32:09 +0100 Subject: [PATCH 6/7] Feedback --- .../src/System/SpanHelpers.Char.cs | 72 +++++++++++-------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 506ec373b3f644..35a52553557f69 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -502,11 +502,11 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu if (Sse2.IsSupported) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. nint vectorDiff = (nint)length - Vector128.Count; if (vectorDiff >= 0) { - // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // >= Sse2 intrinsics are supported and length is enough to use them, so use that path. 
// We jump forward to the intrinsics at the end of them method so a naive branch predict // will choose the non-intrinsic path so short lengths which don't gain anything aren't // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths @@ -517,7 +517,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } else if (Vector.IsHardwareAccelerated) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. nint vectorDiff = (nint)length - Vector.Count; if (vectorDiff >= 0) { @@ -705,7 +705,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu continue; } - goto Difference; + goto VectorMatch; } // Move to Vector length from end for final compare @@ -720,11 +720,15 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu goto NotFound; } - Difference: + VectorMatch: offset += (nuint)(uint)LocateFirstFoundChar(search); + goto Found; + } + else + { + Debug.Fail("Unreachable"); + goto Found; } - - goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -737,11 +741,11 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu if (Sse2.IsSupported) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. nint vectorDiff = (nint)length - Vector128.Count; if (vectorDiff >= 0) { - // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // >= Sse2 intrinsics are supported and length is enough to use them, so use that path. // We jump forward to the intrinsics at the end of them method so a naive branch predict // will choose the non-intrinsic path so short lengths which don't gain anything aren't // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths @@ -752,7 +756,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } else if (Vector.IsHardwareAccelerated) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. nint vectorDiff = (nint)length - Vector.Count; if (vectorDiff >= 0) { @@ -953,7 +957,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu continue; } - goto Difference; + goto VectorMatch; } // Move to Vector length from end for final compare @@ -970,11 +974,15 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu goto NotFound; } - Difference: + VectorMatch: offset += (nuint)(uint)LocateFirstFoundChar(search); + goto Found; + } + else + { + Debug.Fail("Unreachable"); + goto Found; } - - goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -987,11 +995,11 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu if (Sse2.IsSupported) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. 
nint vectorDiff = (nint)length - Vector128.Count; if (vectorDiff >= 0) { - // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // >= Sse2 intrinsics are supported and length is enough to use them, so use that path. // We jump forward to the intrinsics at the end of them method so a naive branch predict // will choose the non-intrinsic path so short lengths which don't gain anything aren't // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths @@ -1002,7 +1010,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } else if (Vector.IsHardwareAccelerated) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. nint vectorDiff = (nint)length - Vector.Count; if (vectorDiff >= 0) { @@ -1199,7 +1207,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu continue; } - goto Difference; + goto VectorMatch; } // Move to Vector length from end for final compare @@ -1218,11 +1226,15 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu goto NotFound; } - Difference: + VectorMatch: offset += (nuint)(uint)LocateFirstFoundChar(search); + goto Found; + } + else + { + Debug.Fail("Unreachable"); + goto Found; } - - goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -1235,11 +1247,11 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu if (Sse2.IsSupported) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. nint vectorDiff = (nint)length - Vector128.Count; if (vectorDiff >= 0) { - // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // >= Sse2 intrinsics are supported and length is enough to use them, so use that path. // We jump forward to the intrinsics at the end of them method so a naive branch predict // will choose the non-intrinsic path so short lengths which don't gain anything aren't // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths @@ -1250,7 +1262,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu } else if (Vector.IsHardwareAccelerated) { - // Calucate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. + // Calculate lengthToExamine here for test, rather than just testing as it used later, rather than doing it twice. 
nint vectorDiff = (nint)length - Vector.Count; if (vectorDiff >= 0) { @@ -1457,7 +1469,7 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu continue; } - goto Difference; + goto VectorMatch; } // Move to Vector length from end for final compare @@ -1478,11 +1490,15 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu goto NotFound; } - Difference: + VectorMatch: offset += (nuint)(uint)LocateFirstFoundChar(search); + goto Found; + } + else + { + Debug.Fail("Unreachable"); + goto Found; } - - goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] From bef116ed87412335d1adfe70cdc974fa1befd52c Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Sat, 15 Aug 2020 00:26:22 +0100 Subject: [PATCH 7/7] Super nit --- .../src/System/SpanHelpers.Char.cs | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index 35a52553557f69..4c4070ef82e00a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -724,11 +724,9 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu offset += (nuint)(uint)LocateFirstFoundChar(search); goto Found; } - else - { - Debug.Fail("Unreachable"); - goto Found; - } + + Debug.Fail("Unreachable"); + goto NotFound; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -978,11 +976,9 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu offset += (nuint)(uint)LocateFirstFoundChar(search); goto Found; } - else - { - Debug.Fail("Unreachable"); - goto Found; - } + + Debug.Fail("Unreachable"); + goto NotFound; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -1230,11 +1226,9 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu offset += (nuint)(uint)LocateFirstFoundChar(search); goto Found; } - else - { - Debug.Fail("Unreachable"); - goto Found; - } + + Debug.Fail("Unreachable"); + goto NotFound; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -1494,11 +1488,9 @@ public static unsafe int IndexOfAny(ref char searchStart, char value0, char valu offset += (nuint)(uint)LocateFirstFoundChar(search); goto Found; } - else - { - Debug.Fail("Unreachable"); - goto Found; - } + + Debug.Fail("Unreachable"); + goto NotFound; } [MethodImpl(MethodImplOptions.AggressiveOptimization)]
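
Every overload in this series drives its vector loop the same way: lengthToExamine is reduced by one vector up front, full vectors are consumed while lengthToExamine > offset, and the remainder is handled by a single load placed one vector from the end, which may overlap the block already examined but never falls back to a scalar tail. A condensed single-value sketch of that shape (illustrative code, not taken from the patch, assuming SSE2 support and a span of at least Vector128<ushort>.Count chars):

using System;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static class TrailingCompareSketch
{
    internal static unsafe int IndexOf(ReadOnlySpan<char> span, char value)
    {
        Vector128<ushort> values = Vector128.Create((ushort)value);
        fixed (char* p = span)
        {
            ushort* start = (ushort*)p;
            int offset = 0;
            // Last offset at which a full vector still fits; the final compare
            // below covers everything past this point.
            int lengthToExamine = span.Length - Vector128<ushort>.Count;

            while (lengthToExamine > offset)
            {
                int matches = Sse2.MoveMask(
                    Sse2.CompareEqual(values, Sse2.LoadVector128(start + offset)).AsByte());
                if (matches != 0)
                    return offset + (BitOperations.TrailingZeroCount(matches) >> 1);
                offset += Vector128<ushort>.Count;
            }

            // Final compare, positioned one vector back from the end; it may overlap
            // the previous iteration but keeps the tail fully vectorized.
            int tail = Sse2.MoveMask(
                Sse2.CompareEqual(values, Sse2.LoadVector128(start + lengthToExamine)).AsByte());
            return tail == 0 ? -1 : lengthToExamine + (BitOperations.TrailingZeroCount(tail) >> 1);
        }
    }
}

The real methods keep offset and lengthToExamine in nuint rather than int, and the "Avoid movsx" commit adds the (nuint)(uint) casts so those widenings are zero-extensions rather than sign-extending movsx instructions.
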