Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2287,10 +2287,23 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
{
switch (subsequent.Kind)
{
// Concatenate, capture, and atomic do not impact what comes at the beginning of their children,
// so we can skip down to the first child.
case RegexNodeKind.Concatenate:
case RegexNodeKind.Capture:
case RegexNodeKind.Atomic:

// Similarly, as long as a loop is guaranteed to iterate at least once, we can skip down to the child,
// as whatever starts it is guaranteed to come after the predecessor.
case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when subsequent.M > 0:

// Positive lookaheads (i.e. positive lookarounds that do not have RegexOptions.RightToLeft set, which is
// what the guard on the case below checks) can also be skipped through. The lookahead logically comes
// after the predecessor, and even though it's zero-width, we don't need to look at whatever comes after
// the lookahead, because the lookahead ends up overlapping with its successor. If the node is disjoint
// from the lookahead, then it's also disjoint from the intersection of the lookahead and the lookahead's
// successor, since the intersection can only narrow the possible set of characters that need to be
// considered for overlap with the predecessor node.
case RegexNodeKind.PositiveLookaround when (subsequent.Options & RegexOptions.RightToLeft) == 0:
subsequent = subsequent.Child(0);
continue;
}
Expand Down Expand Up @@ -2330,13 +2343,6 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
// If it doesn't, then we can upgrade it to being atomic to avoid unnecessary backtracking.
switch (node.Kind)
{
case RegexNodeKind when iterateNullableSubsequent && subsequent.Kind is RegexNodeKind.PositiveLookaround:
if (!CanBeMadeAtomic(node, subsequent.Child(0), iterateNullableSubsequent: false, allowLazy: allowLazy))
{
return false;
}
break;

case RegexNodeKind.Oneloop:
case RegexNodeKind.Onelazy when allowLazy:
switch (subsequent.Kind)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,18 @@ public class RegexReductionTests
[InlineData(@"abc(?=\A)", @"abc\A")]
[InlineData(@"abc(?=$)", @"abc$")]
[InlineData(@"a*(?=b)bcd", @"(?>a*)(?=b)bcd")]
[InlineData(@"a*(?=b)a", @"(?>a*)(?=b)a")]
[InlineData(@"a+(?=\b)a", @"(?>a+)(?=\b)a")]
[InlineData(@"(?=(?!abc))", @"(?!abc)")]
[InlineData(@"(?=(?:abc))", @"(?=abc)")]
[InlineData(@"(?!((?:a)b(?:c)))", @"(?!abc)")]
[InlineData(@"(?=abc|abd)", @"(?=ab[cd])")]
[InlineData(@"(?!abc|abd)", @"(?!ab[cd])")]
[InlineData(@"(?<!(abc))", @"(?<!abc)")]
[InlineData(@".*\n", @"(?>.*)\n")]
[InlineData(@".*\n+", @"(?>.*)(?>\n+)")]
[InlineData(@"(?((?=a))b)", @"(?((?=a))b|)")]
[InlineData(@"(?((?!a))b)", @"(?((?!a))b|)")]
// Alternation reduction
[InlineData("a|b", "[ab]")]
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
Expand All @@ -313,6 +325,15 @@ public class RegexReductionTests
[InlineData("(?:a|)a", "a{1,2}")]
[InlineData("(?:a|)a*", "a*")]
[InlineData("a+(?:a|)", "a+")]
[InlineData(@"ab*(?=c)", @"a(?>b*)(?=c)")]
[InlineData(@"\d+(?=\D)", @"(?>\d+)(?=\D)")]
[InlineData(@"\D+(?=\d)", @"(?>\D+)(?=\d)")]
[InlineData(@"\s+(?=\S)", @"(?>\s+)(?=\S)")]
[InlineData(@"\S+(?=\s)", @"(?>\S+)(?=\s)")]
[InlineData(@"[^\n]+(?=\n)", @"(?>[^\n]+)(?=\n)")]
[InlineData(@"[a-f]+(?=[^a-f])", @"(?>[a-f]+)(?=[^a-f])")]
[InlineData(@"[0-9]*(?=[^0-9])", @"(?>[0-9]*)(?=[^0-9])")]
[InlineData(@"a*(?=b)(?=bc)", @"(?>a*)(?=b)(?=bc)")]
// [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree
[InlineData("abcdef|abcde", "abcde(?>f|)")]
[InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]
Expand Down Expand Up @@ -575,6 +596,17 @@ public void PatternsReduceIdentically(string actual, string expected)
[InlineData("(abc?)*?d", "(?>(ab(?>c?))*)d")]
[InlineData("(aba)+d", "(?>(aba)+)d")]
[InlineData("(abc*)*d", "(?>(ab(?>c*))*)d")]
[InlineData(@"a*?(?=b)", @"(?>a*?)(?=b)")]
[InlineData(@"a+?(?=b)", @"(?>a+?)(?=b)")]
[InlineData(@"a{1,3}?(?=b)", @"(?>a{1,3}?)(?=b)")]
[InlineData(@"[ab]*(?=a)", @"(?>[ab]*)(?=a)")]
[InlineData(@"\w*(?=\b)", @"(?>\w*)(?=\b)")]
[InlineData(@".*(?=\b)", @"(?>.*)(?=\b)")]
[InlineData(@".*(?=^)", @"(?>.*)(?=^)")]
[InlineData(@"a*(?<=a)", @"(?>a*)(?<=a)")]
[InlineData(@"a+(?<=a)", @"(?>a+)(?<=a)")]
[InlineData(@"\d*(?<=\d)", @"(?>\d*)(?<=\d)")]
[InlineData(@"[ab]*?(?!a)", @"(?>[ab]*?)(?!a)")]
// Lookaround reduction
[InlineData("(?=(abc))", "(?=abc)")]
[InlineData("(?=a(b*)c)", "(?=ab*c)")]
Expand All @@ -583,6 +615,15 @@ public void PatternsReduceIdentically(string actual, string expected)
[InlineData(@"a*(?!b)b", @"(?>a*)(?!b)b")]
[InlineData(@"a*(?<!b)cde", @"(?>a*)(?<!b)cde")]
[InlineData(@"a*(?<=b)cde", @"(?>a*)(?<=b)cde")]
[InlineData(@"a*(?=)a", @"(?>a*)(?=)a")]
[InlineData(@"(?<=(ab))", @"(?<=ab)")]
[InlineData(@"(?=(ab|ac))", @"(?=a[bc])")]
[InlineData(@"(?<=(ab|ac))", @"(?<=a[bc])")]
[InlineData(@"(?=ab)|ac", @"a(?=b|c)")]
[InlineData(@"(?=ab)c", @"a(?=bc)")]
[InlineData(@"(?!ab)c", @"a(?!bc)")]
[InlineData(@"(?=a)(?=b)", @"(?=ab)")]
[InlineData(@"(?!ab)(?!ac)", @"(?!a[bc])")]
// Loops inside alternation constructs
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]
Expand Down
Loading