Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Factor positive lookaheads better into find optimizations #112107

Merged
merged 2 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#if SYSTEM_TEXT_REGULAREXPRESSIONS
using System.Buffers;
#endif
using System.Collections.Generic;
using System.Diagnostics;

Expand All @@ -10,28 +12,55 @@ namespace System.Text.RegularExpressions
/// <summary>Contains state and provides operations related to finding the next location a match could possibly begin.</summary>
internal sealed class RegexFindOptimizations
{
/// <summary>True if the input should be processed right-to-left rather than left-to-right.</summary>
private readonly bool _rightToLeft;
/// <summary>Lookup table used for optimizing ASCII when doing set queries.</summary>
private readonly uint[]?[]? _asciiLookups;

public RegexFindOptimizations(RegexNode root, RegexOptions options)
public static RegexFindOptimizations Create(RegexNode root, RegexOptions options)
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
{
RegexFindOptimizations opts = new(root, options, isLeadingPartial: false);

if ((options & RegexOptions.RightToLeft) == 0 &&
!opts.IsUseful &&
RegexPrefixAnalyzer.FindLeadingPositiveLookahead(root) is RegexNode positiveLookahead)
{
RegexFindOptimizations positiveLookaheadOpts = new(positiveLookahead.Child(0), options, isLeadingPartial: true);

// Fixups to incorporate relevant information from the original optimizations.
// - If the original has a larger minimum length than the lookahead, use it. Lookaheads don't currently factor into
// the computation of the minimum as it complicates the logic due to them possibly overlapping with other portions.
// - Use whatever max came from the original, if any. We shouldn't have computed a max for the lookahead because
// it's partial.
positiveLookaheadOpts.MinRequiredLength = Math.Max(opts.MinRequiredLength, positiveLookaheadOpts.MinRequiredLength);
positiveLookaheadOpts.MaxPossibleLength = opts.MaxPossibleLength;

opts = positiveLookaheadOpts;
}

return opts;
}

/// <summary>Creates optimization information for searching with the pattern represented by <paramref name="root"/>.</summary>
/// <param name="root">The root of the pattern node tree.</param>
/// <param name="options">Options used when creating the regex.</param>
/// <param name="isLeadingPartial">true if <paramref name="root"/> may not represent the whole pattern, only a leading node in it.</param>
private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLeadingPartial)
{
_rightToLeft = (options & RegexOptions.RightToLeft) != 0;
bool rightToLeft = (options & RegexOptions.RightToLeft) != 0;
Debug.Assert(!isLeadingPartial || !rightToLeft, "RightToLeft unexpected when isLeadingPartial");

MinRequiredLength = root.ComputeMinLength();

// Compute any anchor starting the expression. If there is one, we won't need to search for anything,
// as we can just match at that single location.
LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root);
if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
if (rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
{
// Filter out Bol for RightToLeft, as we don't currently optimize for it.
LeadingAnchor = RegexNodeKind.Unknown;
}
if (LeadingAnchor is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.EndZ or RegexNodeKind.End)
{
FindMode = (LeadingAnchor, _rightToLeft) switch
FindMode = (LeadingAnchor, rightToLeft) switch
{
(RegexNodeKind.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning,
(RegexNodeKind.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning,
Expand All @@ -47,7 +76,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)

// Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length
// for the whole expression, we can use that to quickly jump to the right location in the input.
if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL
if (!rightToLeft && // haven't added FindNextStartingPositionMode trailing anchor support for RTL
!isLeadingPartial) // trailing anchors in a partial root aren't relevant
{
TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root);
if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ &&
Expand All @@ -70,7 +100,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
if (prefix.Length > 1)
{
LeadingPrefix = prefix;
FindMode = _rightToLeft ?
FindMode = rightToLeft ?
FindNextStartingPositionMode.LeadingString_RightToLeft :
FindNextStartingPositionMode.LeadingString_LeftToRight;
return;
Expand All @@ -89,7 +119,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
// more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter
// we focus only on creating a set for the first character. Same for right-to-left, which is used very
// rarely and thus we don't need to invest in special-casing it.
if (_rightToLeft)
if (rightToLeft)
{
// Determine a set for anything that can possibly start the expression.
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
Expand Down Expand Up @@ -253,21 +283,21 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;

/// <summary>Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
public RegexNodeKind LeadingAnchor { get; }
public RegexNodeKind LeadingAnchor { get; private set; }

/// <summary>Gets the trailing anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
public RegexNodeKind TrailingAnchor { get; }

/// <summary>Gets the minimum required length an input need be to match the pattern.</summary>
/// <remarks>0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression.</remarks>
public int MinRequiredLength { get; }
public int MinRequiredLength { get; private set; }

/// <summary>The maximum possible length an input could be to match the pattern.</summary>
/// <remarks>
/// This is currently only set when <see cref="TrailingAnchor"/> is found to be an end anchor.
/// That can be expanded in the future as needed.
/// </remarks>
public int? MaxPossibleLength { get; }
public int? MaxPossibleLength { get; private set; }

/// <summary>Gets the leading prefix. May be an empty string.</summary>
public string LeadingPrefix { get; } = string.Empty;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -940,7 +940,7 @@ private RegexNode ReduceAlternation()
node = ExtractCommonPrefixText(node);
if (node.Kind == RegexNodeKind.Alternate)
{
node = ExtractCommonPrefixOneNotoneSet(node);
node = ExtractCommonPrefixNode(node);
if (node.Kind == RegexNodeKind.Alternate)
{
node = RemoveRedundantEmptiesAndNothings(node);
Expand Down Expand Up @@ -1072,7 +1072,7 @@ void ReduceSingleLetterAndNestedAlternations()
// This function optimizes out prefix nodes from alternation branches that are
// the same across multiple contiguous branches.
// e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
static RegexNode ExtractCommonPrefixNode(RegexNode alternation)
{
Debug.Assert(alternation.Kind == RegexNodeKind.Alternate);
Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
Expand All @@ -1097,7 +1097,7 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
{
Debug.Assert(children[startingIndex].Children is List<RegexNode> { Count: >= 2 });

// Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop).
// Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor.
// Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
// it for non-atomic variable length loops could change behavior as each branch could otherwise have a
// different number of characters consumed by the loop based on what's after it.
Expand All @@ -1107,6 +1107,10 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set:
case RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic:
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when required.M == required.N:
case RegexNodeKind.Beginning or RegexNodeKind.Bol or RegexNodeKind.Start
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
or RegexNodeKind.End or RegexNodeKind.EndZ
or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
break;

default:
Expand Down
Loading