Skip to content

Commit a0e3cfc

Browse files
committed
Revert "Revert product changes for find_end vectorization (microsoft#5041)"
This reverts commit ca1553d.
1 parent ca1553d commit a0e3cfc

File tree

2 files changed

+313
-0
lines changed

2 files changed

+313
-0
lines changed

stl/inc/algorithm

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ const void* __stdcall __std_find_last_trivial_2(const void* _First, const void*
5959
const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
6060
const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
6161

62+
// Vectorized find_end entry point for 1-byte elements (used by _Find_end_vectorized when sizeof(_Ty1) == 1).
// Returns the start of the last occurrence of the _Count2-element sequence beginning at _First2 within
// [_First1, _Last1), or _Last1 when there is no such occurrence.
const void* __stdcall __std_find_end_1(
63+
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
64+
// Same contract as __std_find_end_1, for 2-byte elements (used when sizeof(_Ty1) == 2).
const void* __stdcall __std_find_end_2(
65+
const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
66+
6267
__declspec(noalias) _Min_max_1i __stdcall __std_minmax_1i(const void* _First, const void* _Last) noexcept;
6368
__declspec(noalias) _Min_max_1u __stdcall __std_minmax_1u(const void* _First, const void* _Last) noexcept;
6469
__declspec(noalias) _Min_max_2i __stdcall __std_minmax_2i(const void* _First, const void* _Last) noexcept;
@@ -189,6 +194,19 @@ _Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val
189194
}
190195
}
191196

197+
template <class _Ty1, class _Ty2>
198+
_Ty1* _Find_end_vectorized(
199+
_Ty1* const _First1, _Ty1* const _Last1, _Ty2* const _First2, const size_t _Count2) noexcept {
200+
_STL_INTERNAL_STATIC_ASSERT(sizeof(_Ty1) == sizeof(_Ty2));
201+
if constexpr (sizeof(_Ty1) == 1) {
202+
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_find_end_1(_First1, _Last1, _First2, _Count2)));
203+
} else if constexpr (sizeof(_Ty1) == 2) {
204+
return const_cast<_Ty1*>(static_cast<const _Ty1*>(::__std_find_end_2(_First1, _Last1, _First2, _Count2)));
205+
} else {
206+
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
207+
}
208+
}
209+
192210
template <class _Ty, class _TVal1, class _TVal2>
193211
__declspec(noalias) void _Replace_vectorized(
194212
_Ty* const _First, _Ty* const _Last, const _TVal1 _Old_val, const _TVal2 _New_val) noexcept {
@@ -3194,6 +3212,26 @@ _NODISCARD _CONSTEXPR20 _FwdIt1 find_end(
31943212
if constexpr (_Is_ranges_random_iter_v<_FwdIt1> && _Is_ranges_random_iter_v<_FwdIt2>) {
31953213
const _Iter_diff_t<_FwdIt2> _Count2 = _ULast2 - _UFirst2;
31963214
if (_Count2 > 0 && _Count2 <= _ULast1 - _UFirst1) {
3215+
#if _USE_STD_VECTOR_ALGORITHMS
3216+
if constexpr (_Vector_alg_in_search_is_safe<decltype(_UFirst1), decltype(_UFirst2), _Pr>) {
3217+
if (!_STD _Is_constant_evaluated()) {
3218+
const auto _Ptr1 = _STD _To_address(_UFirst1);
3219+
3220+
const auto _Ptr_res1 = _STD _Find_end_vectorized(
3221+
_Ptr1, _STD _To_address(_ULast1), _STD _To_address(_UFirst2), static_cast<size_t>(_Count2));
3222+
3223+
if constexpr (is_pointer_v<decltype(_UFirst1)>) {
3224+
_UFirst1 = _Ptr_res1;
3225+
} else {
3226+
_UFirst1 += _Ptr_res1 - _Ptr1;
3227+
}
3228+
3229+
_STD _Seek_wrapped(_First1, _UFirst1);
3230+
return _First1;
3231+
}
3232+
}
3233+
#endif // _USE_STD_VECTOR_ALGORITHMS
3234+
31973235
for (auto _UCandidate = _ULast1 - static_cast<_Iter_diff_t<_FwdIt1>>(_Count2);; --_UCandidate) {
31983236
if (_STD _Equal_rev_pred_unchecked(_UCandidate, _UFirst2, _ULast2, _STD _Pass_fn(_Pred))) {
31993237
_STD _Seek_wrapped(_First1, _UCandidate);
@@ -3297,6 +3335,34 @@ namespace ranges {
32973335

32983336
if (_Count2 > 0 && _Count2 <= _Count1) {
32993337
const auto _Count2_as1 = static_cast<iter_difference_t<_It1>>(_Count2);
3338+
#if _USE_STD_VECTOR_ALGORITHMS
3339+
if constexpr (_Vector_alg_in_search_is_safe<_It1, _It2, _Pr> && is_same_v<_Pj1, identity>
3340+
&& is_same_v<_Pj2, identity>) {
3341+
if (!_STD is_constant_evaluated()) {
3342+
const auto _Ptr1 = _STD to_address(_First1);
3343+
const auto _Ptr2 = _STD to_address(_First2);
3344+
const auto _Ptr_last1 = _Ptr1 + _Count1;
3345+
3346+
const auto _Ptr_res1 =
3347+
_STD _Find_end_vectorized(_Ptr1, _Ptr_last1, _Ptr2, static_cast<size_t>(_Count2));
3348+
3349+
if constexpr (is_pointer_v<_It1>) {
3350+
if (_Ptr_res1 != _Ptr_last1) {
3351+
return {_Ptr_res1, _Ptr_res1 + _Count2};
3352+
} else {
3353+
return {_Ptr_res1, _Ptr_res1};
3354+
}
3355+
} else {
3356+
_First1 += _Ptr_res1 - _Ptr1;
3357+
if (_Ptr_res1 != _Ptr_last1) {
3358+
return {_First1, _First1 + _Count2_as1};
3359+
} else {
3360+
return {_First1, _First1};
3361+
}
3362+
}
3363+
}
3364+
}
3365+
#endif // _USE_STD_VECTOR_ALGORITHMS
33003366

33013367
for (auto _Candidate = _First1 + (_Count1 - _Count2_as1);; --_Candidate) {
33023368
auto _Match_and_mid1 =

stl/src/vector_algorithms.cpp

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3633,6 +3633,243 @@ namespace {
36333633
return _Last1;
36343634
}
36353635
}
3636+
3637+
template <class _Traits, class _Ty>
3638+
const void* __stdcall __std_find_end_impl(
3639+
const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept {
3640+
if (_Count2 == 0) {
3641+
return _Last1;
3642+
}
3643+
3644+
if (_Count2 == 1) {
3645+
return __std_find_last_trivial_impl<_Traits>(_First1, _Last1, *static_cast<const _Ty*>(_First2));
3646+
}
3647+
3648+
const size_t _Size_bytes_1 = _Byte_length(_First1, _Last1);
3649+
const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty);
3650+
3651+
if (_Size_bytes_1 < _Size_bytes_2) {
3652+
return _Last1;
3653+
}
3654+
3655+
#ifndef _M_ARM64EC
3656+
if (_Use_sse42() && _Size_bytes_1 >= 16) {
3657+
constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ORDERED;
3658+
constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 16 : 8;
3659+
3660+
static constexpr int8_t _Low_part_mask[] = {//
3661+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, //
3662+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3663+
3664+
if (_Size_bytes_2 <= 16) {
3665+
const int _Size_el_2 = static_cast<int>(_Count2);
3666+
constexpr unsigned int _Whole_mask = (1 << _Part_size_el) - 1;
3667+
const unsigned int _Needle_fit_mask = (1 << (_Part_size_el - _Size_el_2 + 1)) - 1;
3668+
const unsigned int _Needle_unfit_mask = _Whole_mask ^ _Needle_fit_mask;
3669+
3670+
const void* _Stop1 = _First1;
3671+
_Advance_bytes(_Stop1, _Size_bytes_1 & 0xF);
3672+
3673+
alignas(16) uint8_t _Tmp2[16];
3674+
memcpy(_Tmp2, _First2, _Size_bytes_2);
3675+
const __m128i _Data2 = _mm_load_si128(reinterpret_cast<const __m128i*>(_Tmp2));
3676+
3677+
const void* _Mid1 = _Last1;
3678+
_Rewind_bytes(_Mid1, 16);
3679+
3680+
const auto _Check_fit = [&_Mid1, _Needle_fit_mask](const unsigned int _Match) noexcept {
3681+
const unsigned int _Fit_match = _Match & _Needle_fit_mask;
3682+
if (_Fit_match != 0) {
3683+
unsigned long _Match_last_pos;
3684+
3685+
// CodeQL [SM02313] Result is always initialized: we just tested that _Fit_match is non-zero.
3686+
_BitScanReverse(&_Match_last_pos, _Fit_match);
3687+
3688+
_Advance_bytes(_Mid1, _Match_last_pos * sizeof(_Ty));
3689+
return true;
3690+
}
3691+
3692+
return false;
3693+
};
3694+
3695+
#pragma warning(push)
3696+
#pragma warning(disable : 4324) // structure was padded due to alignment specifier
3697+
const auto _Check_unfit = [=, &_Mid1](const unsigned int _Match) noexcept {
3698+
long _Unfit_match = _Match & _Needle_unfit_mask;
3699+
while (_Unfit_match != 0) {
3700+
const void* _Tmp1 = _Mid1;
3701+
unsigned long _Match_last_pos;
3702+
3703+
// CodeQL [SM02313] Result is always initialized: we just tested that _Unfit_match is non-zero.
3704+
_BitScanReverse(&_Match_last_pos, _Unfit_match);
3705+
3706+
_Advance_bytes(_Tmp1, _Match_last_pos * sizeof(_Ty));
3707+
3708+
const __m128i _Match_data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Tmp1));
3709+
const __m128i _Cmp_result = _mm_xor_si128(_Data2, _Match_data);
3710+
const __m128i _Data_mask =
3711+
_mm_loadu_si128(reinterpret_cast<const __m128i*>(_Low_part_mask + 16 - _Size_bytes_2));
3712+
3713+
if (_mm_testz_si128(_Cmp_result, _Data_mask)) {
3714+
_Mid1 = _Tmp1;
3715+
return true;
3716+
}
3717+
3718+
_bittestandreset(&_Unfit_match, _Match_last_pos);
3719+
}
3720+
3721+
return false;
3722+
};
3723+
#pragma warning(pop)
3724+
3725+
// TRANSITION, DevCom-10689455, the code below could test with _mm_cmpestrc,
3726+
// if it has been fused with _mm_cmpestrm.
3727+
3728+
// The very last part, for any match needle should fit, otherwise false match
3729+
__m128i _Data1_last = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Mid1));
3730+
const auto _Match_last = _mm_cmpestrm(_Data2, _Size_el_2, _Data1_last, _Part_size_el, _Op);
3731+
const unsigned int _Match_last_val = _mm_cvtsi128_si32(_Match_last);
3732+
if (_Check_fit(_Match_last_val)) {
3733+
return _Mid1;
3734+
}
3735+
3736+
// The middle part, fit and unfit needle
3737+
while (_Mid1 != _Stop1) {
3738+
_Rewind_bytes(_Mid1, 16);
3739+
const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Mid1));
3740+
const auto _Match = _mm_cmpestrm(_Data2, _Size_el_2, _Data1, _Part_size_el, _Op);
3741+
const unsigned int _Match_val = _mm_cvtsi128_si32(_Match);
3742+
if (_Match_val != 0 && (_Check_unfit(_Match_val) || _Check_fit(_Match_val))) {
3743+
return _Mid1;
3744+
}
3745+
}
3746+
3747+
// The first part, fit and unfit needle, mask out already processed positions
3748+
if (const size_t _Tail_bytes_1 = _Size_bytes_1 & 0xF; _Tail_bytes_1 != 0) {
3749+
_Mid1 = _First1;
3750+
const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Mid1));
3751+
const auto _Match = _mm_cmpestrm(_Data2, _Size_el_2, _Data1, _Part_size_el, _Op);
3752+
const unsigned int _Match_val = _mm_cvtsi128_si32(_Match) & ((1 << _Tail_bytes_1) - 1);
3753+
if (_Match_val != 0 && (_Check_unfit(_Match_val) || _Check_fit(_Match_val))) {
3754+
return _Mid1;
3755+
}
3756+
}
3757+
3758+
return _Last1;
3759+
} else {
3760+
const __m128i _Data2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_First2));
3761+
3762+
const void* _Tail2 = _First2;
3763+
_Advance_bytes(_Tail2, 16);
3764+
3765+
const void* _Mid1 = _Last1;
3766+
_Rewind_bytes(_Mid1, _Size_bytes_2);
3767+
3768+
const size_t _Size_diff_bytes = _Size_bytes_1 - _Size_bytes_2;
3769+
const void* _Stop1 = _First1;
3770+
_Advance_bytes(_Stop1, _Size_diff_bytes & 0xF);
3771+
3772+
#pragma warning(push)
3773+
#pragma warning(disable : 4324) // structure was padded due to alignment specifier
3774+
const auto _Check = [=, &_Mid1](long _Match) noexcept {
3775+
while (_Match != 0) {
3776+
const void* _Tmp1 = _Mid1;
3777+
unsigned long _Match_last_pos;
3778+
3779+
// CodeQL [SM02313] Result is always initialized: we just tested that _Match is non-zero.
3780+
_BitScanReverse(&_Match_last_pos, _Match);
3781+
3782+
bool _Match_1st_16 = true;
3783+
3784+
if (_Match_last_pos != 0) {
3785+
_Advance_bytes(_Tmp1, _Match_last_pos * sizeof(_Ty));
3786+
3787+
const __m128i _Match_data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Tmp1));
3788+
const __m128i _Cmp_result = _mm_xor_si128(_Data2, _Match_data);
3789+
3790+
if (!_mm_testz_si128(_Cmp_result, _Cmp_result)) {
3791+
_Match_1st_16 = false;
3792+
}
3793+
}
3794+
3795+
if (_Match_1st_16) {
3796+
const void* _Tail1 = _Tmp1;
3797+
_Advance_bytes(_Tail1, 16);
3798+
3799+
if (memcmp(_Tail1, _Tail2, _Size_bytes_2 - 16) == 0) {
3800+
_Mid1 = _Tmp1;
3801+
return true;
3802+
}
3803+
}
3804+
3805+
_bittestandreset(&_Match, _Match_last_pos);
3806+
}
3807+
3808+
return false;
3809+
};
3810+
#pragma warning(pop)
3811+
// TRANSITION, DevCom-10689455, the code below could test with _mm_cmpestrc,
3812+
// if it has been fused with _mm_cmpestrm.
3813+
3814+
// The main part, match all characters
3815+
for (;;) {
3816+
const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Mid1));
3817+
const auto _Match = _mm_cmpestrm(_Data2, _Part_size_el, _Data1, _Part_size_el, _Op);
3818+
const unsigned int _Match_val = _mm_cvtsi128_si32(_Match);
3819+
if (_Match_val != 0 && _Check(_Match_val)) {
3820+
return _Mid1;
3821+
}
3822+
3823+
if (_Mid1 == _Stop1) {
3824+
break;
3825+
}
3826+
3827+
_Rewind_bytes(_Mid1, 16);
3828+
}
3829+
3830+
// The first part, mask out already processed positions
3831+
if (const size_t _Tail_bytes_1 = _Size_diff_bytes & 0xF; _Tail_bytes_1 != 0) {
3832+
_Mid1 = _First1;
3833+
const __m128i _Data1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Mid1));
3834+
const auto _Match = _mm_cmpestrm(_Data2, _Part_size_el, _Data1, _Part_size_el, _Op);
3835+
const unsigned int _Match_val = _mm_cvtsi128_si32(_Match) & ((1 << _Tail_bytes_1) - 1);
3836+
if (_Match_val != 0 && _Check(_Match_val)) {
3837+
return _Mid1;
3838+
}
3839+
}
3840+
3841+
return _Last1;
3842+
}
3843+
} else
3844+
#endif // !defined(_M_ARM64EC)
3845+
{
3846+
auto _Ptr1 = static_cast<const _Ty*>(_Last1) - _Count2;
3847+
const auto _Ptr2 = static_cast<const _Ty*>(_First2);
3848+
3849+
for (;;) {
3850+
if (*_Ptr1 == *_Ptr2) {
3851+
bool _Equal = true;
3852+
3853+
for (size_t _Idx = 1; _Idx != _Count2; ++_Idx) {
3854+
if (_Ptr1[_Idx] != _Ptr2[_Idx]) {
3855+
_Equal = false;
3856+
break;
3857+
}
3858+
}
3859+
3860+
if (_Equal) {
3861+
return _Ptr1;
3862+
}
3863+
}
3864+
3865+
if (_Ptr1 == _First1) {
3866+
return _Last1;
3867+
}
3868+
3869+
--_Ptr1;
3870+
}
3871+
}
3872+
}
36363873
} // unnamed namespace
36373874

36383875
extern "C" {
@@ -3757,6 +3994,16 @@ const void* __stdcall __std_search_2(
37573994
return __std_search_impl<_Find_traits_2, uint16_t>(_First1, _Last1, _First2, _Count2);
37583995
}
37593996

3997+
// Exported find_end entry point for 1-byte elements: runs the shared implementation over uint8_t.
const void* __stdcall __std_find_end_1(
    const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept {
    const void* const _Result = __std_find_end_impl<_Find_traits_1, uint8_t>(_First1, _Last1, _First2, _Count2);
    return _Result;
}
4001+
4002+
// Exported find_end entry point for 2-byte elements: runs the shared implementation over uint16_t.
const void* __stdcall __std_find_end_2(
    const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept {
    const void* const _Result = __std_find_end_impl<_Find_traits_2, uint16_t>(_First1, _Last1, _First2, _Count2);
    return _Result;
}
4006+
37604007
__declspec(noalias) size_t __stdcall __std_mismatch_1(
37614008
const void* const _First1, const void* const _First2, const size_t _Count) noexcept {
37624009
return __std_mismatch_impl<_Find_traits_1, uint8_t>(_First1, _First2, _Count);

0 commit comments

Comments
 (0)