Skip to content

Commit 90424b8

Browse files
1 parent 3551f8b commit 90424b8

File tree

2 files changed

+71
-11
lines changed

2 files changed

+71
-11
lines changed

stl/inc/regex

Lines changed: 26 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1202,16 +1202,17 @@ _INLINE_VAR constexpr unsigned int _Bmp_size = (_Bmp_max + _Bmp_chrs - 1U) / _B
12021202
_INLINE_VAR constexpr unsigned int _ARRAY_THRESHOLD = 4U;
12031203

12041204
enum _Node_flags : int { // flags for nfa nodes with special properties
1205-
_Fl_none = 0x000,
1206-
_Fl_negate = 0x001,
1207-
_Fl_greedy = 0x002,
1208-
_Fl_longest = 0x008, // TRANSITION, ABI: 0x004 is unused; the parser previously marked some nodes with it
1209-
_Fl_class_negated_w = 0x100,
1210-
_Fl_class_negated_s = 0x200,
1211-
_Fl_class_negated_d = 0x400,
1212-
_Fl_begin_needs_w = 0x100,
1213-
_Fl_begin_needs_s = 0x200,
1214-
_Fl_begin_needs_d = 0x400
1205+
_Fl_none = 0x000,
1206+
_Fl_negate = 0x001,
1207+
_Fl_greedy = 0x002,
1208+
_Fl_longest = 0x008, // TRANSITION, ABI: 0x004 is unused; the parser previously marked some nodes with it
1209+
_Fl_class_negated_w = 0x100,
1210+
_Fl_class_negated_s = 0x200,
1211+
_Fl_class_negated_d = 0x400,
1212+
_Fl_class_cl_all_bits = 0x800, // TRANSITION, ABI: GH-5242
1213+
_Fl_begin_needs_w = 0x100,
1214+
_Fl_begin_needs_s = 0x200,
1215+
_Fl_begin_needs_d = 0x400
12151216
};
12161217

12171218
_BITMASK_OPS(_EMPTY_ARGUMENT, _Node_flags)
@@ -2986,11 +2987,19 @@ template <class _FwdIt, class _Elem, class _RxTraits>
29862987
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_named_class(
29872988
typename _RxTraits::char_class_type _Cl, const _Rx_char_class_kind _Kind) {
29882989
// add contents of named class to bracket expression
2990+
using _Char_class_type = typename _RxTraits::char_class_type;
29892991
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
29902992
_Add_elts(_Node, _Cl, _Kind != _Rx_char_class_kind::_Positive);
29912993
if (_Bmp_max <= _STD _Max_limit<typename _RxTraits::_Uelem>()) {
29922994
if (_Kind == _Rx_char_class_kind::_Positive) {
2993-
_Node->_Classes = static_cast<typename _RxTraits::char_class_type>(_Node->_Classes | _Cl);
2995+
auto _Cl_all_bits_set = static_cast<_Char_class_type>(-1);
2996+
if ((_Node->_Classes != _Cl_all_bits_set && _Cl != _Cl_all_bits_set)
2997+
|| _Node->_Classes == _Char_class_type{}) {
2998+
_Node->_Classes = static_cast<_Char_class_type>(_Node->_Classes | _Cl);
2999+
} else if (_Node->_Classes != _Cl) {
3000+
_Node->_Classes = static_cast<_Char_class_type>(_Node->_Classes & _Cl);
3001+
_Node->_Flags |= _Fl_class_cl_all_bits;
3002+
}
29943003
} else {
29953004
auto _Node_flag = static_cast<_Node_flags>(_Kind);
29963005
_Node->_Flags |= _Node_flag;
@@ -3529,6 +3538,9 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
35293538
_Found = true;
35303539
} else if (_Node->_Classes != typename _RxTraits::char_class_type{} && _Traits.isctype(_Ch, _Node->_Classes)) {
35313540
_Found = true;
3541+
} else if ((_Node->_Flags & _Fl_class_cl_all_bits)
3542+
&& _Traits.isctype(_Ch, static_cast<typename _RxTraits::char_class_type>(-1))) {
3543+
_Found = true;
35323544
} else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch, _Node->_Equiv, _Traits)) {
35333545
_Found = true;
35343546
} else if ((_Node->_Flags & _Fl_class_negated_w)
@@ -3905,6 +3917,9 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
39053917
} else if (_Node->_Classes != typename _RxTraits::char_class_type{}
39063918
&& _Traits.isctype(_Ch, _Node->_Classes)) {
39073919
_Found = true;
3920+
} else if ((_Node->_Flags & _Fl_class_cl_all_bits)
3921+
&& _Traits.isctype(_Ch, static_cast<typename _RxTraits::char_class_type>(-1))) {
3922+
_Found = true;
39083923
} else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch, _Node->_Equiv, _Traits)) {
39093924
_Found = true;
39103925
} else if ((_Node->_Flags & _Fl_class_negated_w)

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1275,6 +1275,50 @@ void test_gh_5214() {
12751275
}
12761276
}
12771277

1278+
void test_gh_5243() {
1279+
// GH-5243: <regex>: wregex with regular expression [\w\s] fails to match some spaces
1280+
for (wstring pattern : {LR"([\w])", LR"([\w\w])"}) {
1281+
const test_wregex word_regex(&g_regexTester, pattern);
1282+
word_regex.should_search_match(L"a", L"a");
1283+
word_regex.should_search_match(L"2", L"2");
1284+
word_regex.should_search_match(L"_", L"_");
1285+
word_regex.should_search_match(L"\u00e4", L"\u00e4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
1286+
word_regex.should_search_match(L"\u0662", L"\u0662"); // U+0662 ARABIC-INDIC DIGIT TWO
1287+
word_regex.should_search_fail(L" ");
1288+
word_regex.should_search_fail(L"\u2028"); // U+2028 LINE SEPARATOR
1289+
word_regex.should_search_fail(L".");
1290+
word_regex.should_search_fail(L"-");
1291+
word_regex.should_search_fail(L"\u203d"); // U+203D INTERROBANG
1292+
}
1293+
{
1294+
const test_wregex space_regex(&g_regexTester, LR"([\s])");
1295+
space_regex.should_search_fail(L"a");
1296+
space_regex.should_search_fail(L"2");
1297+
space_regex.should_search_fail(L"_");
1298+
space_regex.should_search_fail(L"\u00e4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
1299+
space_regex.should_search_fail(L"\u0662"); // U+0662 ARABIC-INDIC DIGIT TWO
1300+
space_regex.should_search_match(L" ", L" ");
1301+
space_regex.should_search_match(L"\u2028", L"\u2028"); // U+2028 LINE SEPARATOR
1302+
space_regex.should_search_fail(L".");
1303+
space_regex.should_search_fail(L"-");
1304+
space_regex.should_search_fail(L"\u203d"); // U+203D INTERROBANG
1305+
}
1306+
for (wstring pattern : {LR"([\w\s])", LR"([\s\w])"}) {
1307+
const test_wregex word_or_space_regex(&g_regexTester, pattern);
1308+
word_or_space_regex.should_search_match(L"a", L"a");
1309+
word_or_space_regex.should_search_match(L"2", L"2");
1310+
word_or_space_regex.should_search_match(L"_", L"_");
1311+
word_or_space_regex.should_search_match(L"\u00e4", L"\u00e4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
1312+
word_or_space_regex.should_search_match(L"\u0662", L"\u0662"); // U+0662 ARABIC-INDIC DIGIT TWO
1313+
word_or_space_regex.should_search_match(L" ", L" ");
1314+
word_or_space_regex.should_search_match(L"\u2028", L"\u2028"); // U+2028 LINE SEPARATOR
1315+
word_or_space_regex.should_search_fail(L".");
1316+
word_or_space_regex.should_search_fail(L"-");
1317+
word_or_space_regex.should_search_fail(L"\u203d"); // U+203D INTERROBANG
1318+
}
1319+
}
1320+
1321+
12781322
void test_gh_5245() {
12791323
// GH-5245: <regex>: Successful negative lookahead assertions
12801324
// sometimes mistakenly assign matches to capture groups
@@ -1660,6 +1704,7 @@ int main() {
16601704
test_gh_5167();
16611705
test_gh_5192();
16621706
test_gh_5214();
1707+
test_gh_5243();
16631708
test_gh_5245();
16641709
test_gh_5253();
16651710
test_gh_5362();

0 commit comments

Comments
 (0)