Skip to content

Commit 3551f8b

Browse files
1 parent a87163c commit 3551f8b

File tree

2 files changed

+73
-41
lines changed
  • stl/inc
  • tests/std/tests/GH_005204_regex_collating_ranges

2 files changed

+73
-41
lines changed

stl/inc/regex

Lines changed: 45 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1537,7 +1537,7 @@ public:
15371537
void _Add_char2(_Elem _Ch);
15381538
void _Add_class();
15391539
void _Add_char_to_class(_Elem _Ch);
1540-
void _Add_range2(_Elem, _Elem);
1540+
void _Add_range3(_Elem, _Elem);
15411541
void _Add_named_class(typename _RxTraits::char_class_type, _Rx_char_class_kind);
15421542
void _Add_equiv2(const _Elem*, const _Elem*);
15431543
void _Add_coll2(const _Elem*, const _Elem*);
@@ -1567,11 +1567,8 @@ private:
15671567
_Node_base* _Current;
15681568
regex_constants::syntax_option_type _Flags;
15691569
const _RxTraits& _Traits;
1570-
const int _Bmax; // Do not use; use _Get_bmax instead.
1571-
const int _Tmax; // Do not use; use _Get_tmax instead.
1572-
1573-
unsigned int _Get_bmax() const;
1574-
unsigned int _Get_tmax() const;
1570+
const int _Bmax; // TRANSITION, ABI: preserved for binary compatibility
1571+
const int _Tmax; // TRANSITION, ABI: preserved for binary compatibility
15751572

15761573
public:
15771574
_Builder& operator=(const _Builder&) = delete;
@@ -2911,33 +2908,61 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_char_to_class(_Elem _Ch) { // add
29112908
}
29122909

29132910
// Diff of _Builder::_Add_range2 -> _Builder::_Add_range3. A code line preceded by an `N-` marker is
// the removed implementation, one preceded by an `N+` marker is the added implementation, and one
// preceded by two fused line numbers is unchanged context.
//
// The added implementation adds the character range [_Arg0, _Arg1] to the current class node and
// reports regex_constants::error_range through _Xregex_error when the lower endpoint orders after
// the upper one. With the `collate` flag the endpoints are ordered by the strings returned from
// _Traits.transform; otherwise they are ordered by their _Uelem values.
template <class _FwdIt, class _Elem, class _RxTraits>
2914-
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range2(const _Elem _Arg0, const _Elem _Arg1) {
2911+
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range3(const _Elem _Arg0, const _Elem _Arg1) {
29152912
// add character range to set
2913+
using string_type = typename _RxTraits::string_type;
29162914
unsigned int _Ex0 = static_cast<typename _RxTraits::_Uelem>(_Arg0);
29172915
const unsigned int _Ex1 = static_cast<typename _RxTraits::_Uelem>(_Arg1);
29182916
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
29192917

2920-
for (; _Ex0 <= _Ex1 && _Ex1 < _Get_bmax(); ++_Ex0) { // set a bit
2921-
if (!_Node->_Small) {
2918+
// set bits and check that the range is non-empty
2919+
if (_Flags & regex_constants::collate) { // collating mode: order by the results of _Traits.transform
2920+
_Elem _Ch;
2921+
const auto _Ch_ptr = _STD addressof(_Ch);
2922+
const auto _Arg0_ptr = _STD addressof(_Arg0);
2923+
const auto _Arg1_ptr = _STD addressof(_Arg1);
2924+
const string_type _Arg0_key = _Traits.transform(_Arg0_ptr, _Arg0_ptr + 1);
2925+
const string_type _Arg1_key = _Traits.transform(_Arg1_ptr, _Arg1_ptr + 1);
2926+
2927+
if (_Arg0_key > _Arg1_key) { // lower endpoint's key is greater than the upper endpoint's: reject
2928+
_Xregex_error(regex_constants::error_range);
2929+
}
2930+
2931+
for (unsigned int _UCh = 0; _UCh < _Bmp_max; ++_UCh) { // examine every value below _Bmp_max
2932+
_Ch = static_cast<_Elem>(_UCh);
2933+
const string_type _Ch_key = _Traits.transform(_Ch_ptr, _Ch_ptr + 1);
2934+
if (_Arg0_key <= _Ch_key && _Ch_key <= _Arg1_key) { // key lies within [_Arg0_key, _Arg1_key]
2935+
if (!_Node->_Small) { // allocate the bitmap on first use
2936+
_Node->_Small = new _Bitmap;
2937+
}
2938+
_Node->_Small->_Mark(_UCh);
2939+
}
2940+
}
2941+
} else if (_Ex0 > _Ex1) { // non-collating mode: endpoints are compared as _Uelem values
2942+
_Xregex_error(regex_constants::error_range);
2943+
} else {
2944+
if (!_Node->_Small && _Ex0 < _Bmp_max) { // allocate only if the loop below will mark at least one bit
29222945
_Node->_Small = new _Bitmap;
29232946
}
29242947

2925-
_Node->_Small->_Mark(_Ex0);
2926-
}
2948+
for (; _Ex0 <= _Ex1 && _Ex0 < _Bmp_max; ++_Ex0) { // mark the part of the range below _Bmp_max; bounds _Ex0, where the removed loop bounded _Ex1
2949+
_Node->_Small->_Mark(_Ex0);
2950+
}
29272951

2928-
if ((_Flags & regex_constants::collate) || _Ex1 >= _Ex0) {
2929-
if (_Ex1 - _Ex0 < _Get_tmax()) {
2952+
if (_Ex1 - _Ex0 < _ARRAY_THRESHOLD) { // short remainder: add its characters individually; if the loop above consumed the whole range, _Ex0 == _Ex1 + 1 and this unsigned difference wraps to a large value
29302953
for (; _Ex0 <= _Ex1; ++_Ex0) {
29312954
_Add_char_to_array(static_cast<_Elem>(_Ex0));
29322955
}
2933-
} else { // store remaining range as pair
2934-
if (!_Node->_Ranges) {
2935-
_Node->_Ranges = new _Buf<_Elem>;
2936-
}
2956+
}
2957+
}
29372958

2938-
_Node->_Ranges->_Insert2(static_cast<_Elem>(_Ex0));
2939-
_Node->_Ranges->_Insert2(_Arg1);
2959+
if ((_Flags & regex_constants::collate) || _Ex1 >= _Ex0) { // store remaining range as pair
2960+
if (!_Node->_Ranges) {
2961+
_Node->_Ranges = new _Buf<_Elem>;
29402962
}
2963+
2964+
_Node->_Ranges->_Insert2(static_cast<_Elem>(_Ex0)); // in collating mode _Ex0 was never advanced, so this is still the original lower endpoint
2965+
_Node->_Ranges->_Insert2(_Arg1);
29412966
}
29422967
}
29432968

@@ -2991,16 +3016,6 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Char_to_elts2(const _Elem* const _Firs
29913016
(*_Cur)->_Data._Insert2(_First, _Last);
29923017
}
29933018

2994-
// Removed by this commit (every line below is preceded by an `N-` marker):
// accessor that returned the _Bmax member converted to unsigned int.
template <class _FwdIt, class _Elem, class _RxTraits>
2995-
unsigned int _Builder<_FwdIt, _Elem, _RxTraits>::_Get_bmax() const {
2996-
return static_cast<unsigned int>(_Bmax);
2997-
}
2998-
2999-
// Removed by this commit (every line below is preceded by an `N-` marker):
// accessor that returned the _Tmax member converted to unsigned int.
template <class _FwdIt, class _Elem, class _RxTraits>
3000-
unsigned int _Builder<_FwdIt, _Elem, _RxTraits>::_Get_tmax() const {
3001-
return static_cast<unsigned int>(_Tmax);
3002-
}
3003-
30043019
template <class _FwdIt, class _Elem, class _RxTraits>
30053020
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv2(const _Elem* const _First, const _Elem* const _Last) {
30063021
// add elements of equivalence class to bracket expression
@@ -4328,18 +4343,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_ClassRanges() { // check for valid clas
43284343
_Chr2 = _Traits.translate(_Chr2);
43294344
}
43304345

4331-
if (_Flags & regex_constants::collate) {
4332-
const _Elem* const _Chr1_ptr = _STD addressof(_Chr1);
4333-
const _Elem* const _Chr2_ptr = _STD addressof(_Chr2);
4334-
if (_Traits.transform(_Chr2_ptr, _Chr2_ptr + 1) < _Traits.transform(_Chr1_ptr, _Chr1_ptr + 1)) {
4335-
_Error(regex_constants::error_range);
4336-
}
4337-
} else if (static_cast<typename _RxTraits::_Uelem>(_Chr2)
4338-
< static_cast<typename _RxTraits::_Uelem>(_Chr1)) {
4339-
_Error(regex_constants::error_range);
4340-
}
4341-
4342-
_Nfa._Add_range2(_Chr1, _Chr2);
4346+
_Nfa._Add_range3(_Chr1, _Chr2);
43434347
} else if (_Ret == _Prs_chr) {
43444348
_Nfa._Add_char_to_class(static_cast<_Elem>(_Val));
43454349
}

tests/std/tests/GH_005204_regex_collating_ranges/test.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,9 +562,37 @@ void test_gh_994() {
562562
#endif // !defined(SKIP_COLLATE_TESTS)
563563
}
564564

565+
// Builds two anchored single-character bracket ranges with the given syntax flags and checks, for
// each, that every character inside the range is expected to match and that the characters
// immediately below and above the range are expected to fail.
// NOTE(review): the second argument of should_search_match looks like the expected match text;
// confirm against the test_wregex helper, which is not shown here.
void test_gh_5437_ECMAScript_or_collate(syntax_option_type ECMAScript_or_collate) {
566+
{ // range from U+0001 through U+0200
567+
test_wregex char_range(&g_regexTester, L"^[\u0001-\u0200]$", ECMAScript_or_collate);
568+
for (wchar_t ch = L'\u0001'; ch <= L'\u0200'; ++ch) {
569+
char_range.should_search_match(wstring(1, ch), wstring(1, ch));
570+
}
571+
char_range.should_search_fail(wstring(1, L'\u0000')); // one below the lower endpoint
572+
char_range.should_search_fail(wstring(1, L'\u0201')); // one above the upper endpoint
573+
}
574+
{ // three-character range from U+00FE through U+0100
575+
test_wregex char_range(&g_regexTester, L"^[\u00FE-\u0100]$", ECMAScript_or_collate);
576+
for (wchar_t ch = L'\u00FE'; ch <= L'\u0100'; ++ch) {
577+
char_range.should_search_match(wstring(1, ch), wstring(1, ch));
578+
}
579+
char_range.should_search_fail(wstring(1, L'\u00FD')); // one below the lower endpoint
580+
char_range.should_search_fail(wstring(1, L'\u0101')); // one above the upper endpoint
581+
}
582+
}
583+
584+
// Runs the GH-5437 range checks with the ECMAScript flag and, unless SKIP_COLLATE_TESTS is
// defined, again with regex_constants::collate.
void test_gh_5437() {
585+
// GH-5437: make `wregex` handle small character ranges containing U+00FF and U+0100 correctly
586+
test_gh_5437_ECMAScript_or_collate(ECMAScript); // ranges compared without collation
587+
#ifndef SKIP_COLLATE_TESTS
588+
test_gh_5437_ECMAScript_or_collate(regex_constants::collate); // same ranges with the collate flag
589+
#endif // !defined(SKIP_COLLATE_TESTS)
590+
}
591+
565592
// Test entry point: runs each test group in turn and returns the value of g_regexTester.result().
int main() {
566593
test_collating_ranges_german();
567594
test_gh_994();
595+
test_gh_5437(); // call added by this commit
568596

569597
return g_regexTester.result();
570598
}

0 commit comments

Comments
 (0)