diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 0b0c0a8dd6a672..3dce343ea8867a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2287,10 +2287,23 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i { switch (subsequent.Kind) { + // Concatenate, capture, and atomic do not impact what comes at the beginning of their children, + // so we can skip down to the first child. case RegexNodeKind.Concatenate: case RegexNodeKind.Capture: case RegexNodeKind.Atomic: + + // Similarly, as long as a loop is guaranteed to iterate at least once, we can skip down to the child, + // as whatever starts it is guaranteed to come after the predecessor. case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when subsequent.M > 0: + + // Positive lookaheads can also be skipped through. The lookahead logically comes after the predecessor, + // and even though it's zero width, we don't need to look at whatever comes after the lookahead, because + // the lookahead ends up overlapping with its successor. If the node is disjoint from the lookahead, then + // it's also disjoint from the intersection of the lookahead and the lookahead's successor, since the + // intersection can only narrow the possible set of characters that need to be considered for overlap with + // the predecessor node. + case RegexNodeKind.PositiveLookaround when (subsequent.Options & RegexOptions.RightToLeft) == 0: subsequent = subsequent.Child(0); continue; } @@ -2330,13 +2343,6 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i // If it doesn't, then we can upgrade it to being atomic to avoid unnecessary backtracking. switch (node.Kind) { - case RegexNodeKind when iterateNullableSubsequent && subsequent.Kind is RegexNodeKind.PositiveLookaround: - if (!CanBeMadeAtomic(node, subsequent.Child(0), iterateNullableSubsequent: false, allowLazy: allowLazy)) - { - return false; - } - break; - case RegexNodeKind.Oneloop: case RegexNodeKind.Onelazy when allowLazy: switch (subsequent.Kind) diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 68fb3925972dd1..bca10c010db7d0 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -289,6 +289,18 @@ public class RegexReductionTests [InlineData(@"abc(?=\A)", @"abc\A")] [InlineData(@"abc(?=$)", @"abc$")] [InlineData(@"a*(?=b)bcd", @"(?>a*)(?=b)bcd")] + [InlineData(@"a*(?=b)a", @"(?>a*)(?=b)a")] + [InlineData(@"a+(?=\b)a", @"(?>a+)(?=\b)a")] + [InlineData(@"(?=(?!abc))", @"(?!abc)")] + [InlineData(@"(?=(?:abc))", @"(?=abc)")] + [InlineData(@"(?!((?:a)b(?:c)))", @"(?!abc)")] + [InlineData(@"(?=abc|abd)", @"(?=ab[cd])")] + [InlineData(@"(?!abc|abd)", @"(?!ab[cd])")] + [InlineData(@"(?.*)\n")] + [InlineData(@".*\n+", @"(?>.*)(?>\n+)")] + [InlineData(@"(?((?=a))b)", @"(?((?=a))b|)")] + [InlineData(@"(?((?!a))b)", @"(?((?!a))b|)")] // Alternation reduction [InlineData("a|b", "[ab]")] [InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")] @@ -313,6 +325,15 @@ public class RegexReductionTests [InlineData("(?:a|)a", "a{1,2}")] [InlineData("(?:a|)a*", "a*")] [InlineData("a+(?:a|)", "a+")] + [InlineData(@"ab*(?=c)", @"a(?>b*)(?=c)")] + [InlineData(@"\d+(?=\D)", @"(?>\d+)(?=\D)")] + [InlineData(@"\D+(?=\d)", @"(?>\D+)(?=\d)")] + [InlineData(@"\s+(?=\S)", @"(?>\s+)(?=\S)")] + [InlineData(@"\S+(?=\s)", @"(?>\S+)(?=\s)")] + [InlineData(@"[^\n]+(?=\n)", @"(?>[^\n]+)(?=\n)")] + [InlineData(@"[a-f]+(?=[^a-f])", @"(?>[a-f]+)(?=[^a-f])")] + [InlineData(@"[0-9]*(?=[^0-9])", @"(?>[0-9]*)(?=[^0-9])")] + [InlineData(@"a*(?=b)(?=bc)", @"(?>a*)(?=b)(?=bc)")] // [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree [InlineData("abcdef|abcde", "abcde(?>f|)")] [InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")] @@ -575,6 +596,17 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData("(abc?)*?d", "(?>(ab(?>c?))*)d")] [InlineData("(aba)+d", "(?>(aba)+)d")] [InlineData("(abc*)*d", "(?>(ab(?>c*))*)d")] + [InlineData(@"a*?(?=b)", @"(?>a*?)(?=b)")] + [InlineData(@"a+?(?=b)", @"(?>a+?)(?=b)")] + [InlineData(@"a{1,3}?(?=b)", @"(?>a{1,3}?)(?=b)")] + [InlineData(@"[ab]*(?=a)", @"(?>[ab]*)(?=a)")] + [InlineData(@"\w*(?=\b)", @"(?>\w*)(?=\b)")] + [InlineData(@".*(?=\b)", @"(?>.*)(?=\b)")] + [InlineData(@".*(?=^)", @"(?>.*)(?=^)")] + [InlineData(@"a*(?<=a)", @"(?>a*)(?<=a)")] + [InlineData(@"a+(?<=a)", @"(?>a+)(?<=a)")] + [InlineData(@"\d*(?<=\d)", @"(?>\d*)(?<=\d)")] + [InlineData(@"[ab]*?(?!a)", @"(?>[ab]*?)(?!a)")] // Lookaround reduction [InlineData("(?=(abc))", "(?=abc)")] [InlineData("(?=a(b*)c)", "(?=ab*c)")] @@ -583,6 +615,15 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData(@"a*(?!b)b", @"(?>a*)(?!b)b")] [InlineData(@"a*(?a*)(?a*)(?<=b)cde")] + [InlineData(@"a*(?=)a", @"(?>a*)(?=)a")] + [InlineData(@"(?<=(ab))", @"(?<=ab)")] + [InlineData(@"(?=(ab|ac))", @"(?=a[bc])")] + [InlineData(@"(?<=(ab|ac))", @"(?<=a[bc])")] + [InlineData(@"(?=ab)|ac", @"a(?=b|c)")] + [InlineData(@"(?=ab)c", @"a(?=bc)")] + [InlineData(@"(?!ab)c", @"a(?!bc)")] + [InlineData(@"(?=a)(?=b)", @"(?=ab)")] + [InlineData(@"(?!ab)(?!ac)", @"(?!a[bc])")] // Loops inside alternation constructs [InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")] [InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]