@@ -3633,6 +3633,243 @@ namespace {
3633
3633
return _Last1;
3634
3634
}
3635
3635
}
3636
+
3637
+ template <class _Traits , class _Ty >
3638
+ const void * __stdcall __std_find_end_impl (
3639
+ const void * const _First1, const void * const _Last1, const void * const _First2, const size_t _Count2) noexcept {
3640
+ if (_Count2 == 0 ) {
3641
+ return _Last1;
3642
+ }
3643
+
3644
+ if (_Count2 == 1 ) {
3645
+ return __std_find_last_trivial_impl<_Traits>(_First1, _Last1, *static_cast <const _Ty*>(_First2));
3646
+ }
3647
+
3648
+ const size_t _Size_bytes_1 = _Byte_length (_First1, _Last1);
3649
+ const size_t _Size_bytes_2 = _Count2 * sizeof (_Ty);
3650
+
3651
+ if (_Size_bytes_1 < _Size_bytes_2) {
3652
+ return _Last1;
3653
+ }
3654
+
3655
+ #ifndef _M_ARM64EC
3656
+ if (_Use_sse42 () && _Size_bytes_1 >= 16 ) {
3657
+ constexpr int _Op = (sizeof (_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ORDERED;
3658
+ constexpr int _Part_size_el = sizeof (_Ty) == 1 ? 16 : 8 ;
3659
+
3660
+ static constexpr int8_t _Low_part_mask[] = {//
3661
+ -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //
3662
+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
3663
+
3664
+ if (_Size_bytes_2 <= 16 ) {
3665
+ const int _Size_el_2 = static_cast <int >(_Count2);
3666
+ constexpr unsigned int _Whole_mask = (1 << _Part_size_el) - 1 ;
3667
+ const unsigned int _Needle_fit_mask = (1 << (_Part_size_el - _Size_el_2 + 1 )) - 1 ;
3668
+ const unsigned int _Needle_unfit_mask = _Whole_mask ^ _Needle_fit_mask;
3669
+
3670
+ const void * _Stop1 = _First1;
3671
+ _Advance_bytes (_Stop1, _Size_bytes_1 & 0xF );
3672
+
3673
+ alignas (16 ) uint8_t _Tmp2[16 ];
3674
+ memcpy (_Tmp2, _First2, _Size_bytes_2);
3675
+ const __m128i _Data2 = _mm_load_si128 (reinterpret_cast <const __m128i*>(_Tmp2));
3676
+
3677
+ const void * _Mid1 = _Last1;
3678
+ _Rewind_bytes (_Mid1, 16 );
3679
+
3680
+ const auto _Check_fit = [&_Mid1, _Needle_fit_mask](const unsigned int _Match) noexcept {
3681
+ const unsigned int _Fit_match = _Match & _Needle_fit_mask;
3682
+ if (_Fit_match != 0 ) {
3683
+ unsigned long _Match_last_pos;
3684
+
3685
+ // CodeQL [SM02313] Result is always initialized: we just tested that _Fit_match is non-zero.
3686
+ _BitScanReverse (&_Match_last_pos, _Fit_match);
3687
+
3688
+ _Advance_bytes (_Mid1, _Match_last_pos * sizeof (_Ty));
3689
+ return true ;
3690
+ }
3691
+
3692
+ return false ;
3693
+ };
3694
+
3695
+ #pragma warning(push)
3696
+ #pragma warning(disable : 4324) // structure was padded due to alignment specifier
3697
+ const auto _Check_unfit = [=, &_Mid1](const unsigned int _Match) noexcept {
3698
+ long _Unfit_match = _Match & _Needle_unfit_mask;
3699
+ while (_Unfit_match != 0 ) {
3700
+ const void * _Tmp1 = _Mid1;
3701
+ unsigned long _Match_last_pos;
3702
+
3703
+ // CodeQL [SM02313] Result is always initialized: we just tested that _Unfit_match is non-zero.
3704
+ _BitScanReverse (&_Match_last_pos, _Unfit_match);
3705
+
3706
+ _Advance_bytes (_Tmp1, _Match_last_pos * sizeof (_Ty));
3707
+
3708
+ const __m128i _Match_data = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Tmp1));
3709
+ const __m128i _Cmp_result = _mm_xor_si128 (_Data2, _Match_data);
3710
+ const __m128i _Data_mask =
3711
+ _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Low_part_mask + 16 - _Size_bytes_2));
3712
+
3713
+ if (_mm_testz_si128 (_Cmp_result, _Data_mask)) {
3714
+ _Mid1 = _Tmp1;
3715
+ return true ;
3716
+ }
3717
+
3718
+ _bittestandreset (&_Unfit_match, _Match_last_pos);
3719
+ }
3720
+
3721
+ return false ;
3722
+ };
3723
+ #pragma warning(pop)
3724
+
3725
+ // TRANSITION, DevCom-10689455, the code below could test with _mm_cmpestrc,
3726
+ // if it has been fused with _mm_cmpestrm.
3727
+
3728
+ // The very last part, for any match needle should fit, otherwise false match
3729
+ __m128i _Data1_last = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Mid1));
3730
+ const auto _Match_last = _mm_cmpestrm (_Data2, _Size_el_2, _Data1_last, _Part_size_el, _Op);
3731
+ const unsigned int _Match_last_val = _mm_cvtsi128_si32 (_Match_last);
3732
+ if (_Check_fit (_Match_last_val)) {
3733
+ return _Mid1;
3734
+ }
3735
+
3736
+ // The middle part, fit and unfit needle
3737
+ while (_Mid1 != _Stop1) {
3738
+ _Rewind_bytes (_Mid1, 16 );
3739
+ const __m128i _Data1 = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Mid1));
3740
+ const auto _Match = _mm_cmpestrm (_Data2, _Size_el_2, _Data1, _Part_size_el, _Op);
3741
+ const unsigned int _Match_val = _mm_cvtsi128_si32 (_Match);
3742
+ if (_Match_val != 0 && (_Check_unfit (_Match_val) || _Check_fit (_Match_val))) {
3743
+ return _Mid1;
3744
+ }
3745
+ }
3746
+
3747
+ // The first part, fit and unfit needle, mask out already processed positions
3748
+ if (const size_t _Tail_bytes_1 = _Size_bytes_1 & 0xF ; _Tail_bytes_1 != 0 ) {
3749
+ _Mid1 = _First1;
3750
+ const __m128i _Data1 = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Mid1));
3751
+ const auto _Match = _mm_cmpestrm (_Data2, _Size_el_2, _Data1, _Part_size_el, _Op);
3752
+ const unsigned int _Match_val = _mm_cvtsi128_si32 (_Match) & ((1 << _Tail_bytes_1) - 1 );
3753
+ if (_Match_val != 0 && (_Check_unfit (_Match_val) || _Check_fit (_Match_val))) {
3754
+ return _Mid1;
3755
+ }
3756
+ }
3757
+
3758
+ return _Last1;
3759
+ } else {
3760
+ const __m128i _Data2 = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_First2));
3761
+
3762
+ const void * _Tail2 = _First2;
3763
+ _Advance_bytes (_Tail2, 16 );
3764
+
3765
+ const void * _Mid1 = _Last1;
3766
+ _Rewind_bytes (_Mid1, _Size_bytes_2);
3767
+
3768
+ const size_t _Size_diff_bytes = _Size_bytes_1 - _Size_bytes_2;
3769
+ const void * _Stop1 = _First1;
3770
+ _Advance_bytes (_Stop1, _Size_diff_bytes & 0xF );
3771
+
3772
+ #pragma warning(push)
3773
+ #pragma warning(disable : 4324) // structure was padded due to alignment specifier
3774
+ const auto _Check = [=, &_Mid1](long _Match) noexcept {
3775
+ while (_Match != 0 ) {
3776
+ const void * _Tmp1 = _Mid1;
3777
+ unsigned long _Match_last_pos;
3778
+
3779
+ // CodeQL [SM02313] Result is always initialized: we just tested that _Match is non-zero.
3780
+ _BitScanReverse (&_Match_last_pos, _Match);
3781
+
3782
+ bool _Match_1st_16 = true ;
3783
+
3784
+ if (_Match_last_pos != 0 ) {
3785
+ _Advance_bytes (_Tmp1, _Match_last_pos * sizeof (_Ty));
3786
+
3787
+ const __m128i _Match_data = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Tmp1));
3788
+ const __m128i _Cmp_result = _mm_xor_si128 (_Data2, _Match_data);
3789
+
3790
+ if (!_mm_testz_si128 (_Cmp_result, _Cmp_result)) {
3791
+ _Match_1st_16 = false ;
3792
+ }
3793
+ }
3794
+
3795
+ if (_Match_1st_16) {
3796
+ const void * _Tail1 = _Tmp1;
3797
+ _Advance_bytes (_Tail1, 16 );
3798
+
3799
+ if (memcmp (_Tail1, _Tail2, _Size_bytes_2 - 16 ) == 0 ) {
3800
+ _Mid1 = _Tmp1;
3801
+ return true ;
3802
+ }
3803
+ }
3804
+
3805
+ _bittestandreset (&_Match, _Match_last_pos);
3806
+ }
3807
+
3808
+ return false ;
3809
+ };
3810
+ #pragma warning(pop)
3811
+ // TRANSITION, DevCom-10689455, the code below could test with _mm_cmpestrc,
3812
+ // if it has been fused with _mm_cmpestrm.
3813
+
3814
+ // The main part, match all characters
3815
+ for (;;) {
3816
+ const __m128i _Data1 = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Mid1));
3817
+ const auto _Match = _mm_cmpestrm (_Data2, _Part_size_el, _Data1, _Part_size_el, _Op);
3818
+ const unsigned int _Match_val = _mm_cvtsi128_si32 (_Match);
3819
+ if (_Match_val != 0 && _Check (_Match_val)) {
3820
+ return _Mid1;
3821
+ }
3822
+
3823
+ if (_Mid1 == _Stop1) {
3824
+ break ;
3825
+ }
3826
+
3827
+ _Rewind_bytes (_Mid1, 16 );
3828
+ }
3829
+
3830
+ // The first part, mask out already processed positions
3831
+ if (const size_t _Tail_bytes_1 = _Size_diff_bytes & 0xF ; _Tail_bytes_1 != 0 ) {
3832
+ _Mid1 = _First1;
3833
+ const __m128i _Data1 = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(_Mid1));
3834
+ const auto _Match = _mm_cmpestrm (_Data2, _Part_size_el, _Data1, _Part_size_el, _Op);
3835
+ const unsigned int _Match_val = _mm_cvtsi128_si32 (_Match) & ((1 << _Tail_bytes_1) - 1 );
3836
+ if (_Match_val != 0 && _Check (_Match_val)) {
3837
+ return _Mid1;
3838
+ }
3839
+ }
3840
+
3841
+ return _Last1;
3842
+ }
3843
+ } else
3844
+ #endif // !defined(_M_ARM64EC)
3845
+ {
3846
+ auto _Ptr1 = static_cast <const _Ty*>(_Last1) - _Count2;
3847
+ const auto _Ptr2 = static_cast <const _Ty*>(_First2);
3848
+
3849
+ for (;;) {
3850
+ if (*_Ptr1 == *_Ptr2) {
3851
+ bool _Equal = true ;
3852
+
3853
+ for (size_t _Idx = 1 ; _Idx != _Count2; ++_Idx) {
3854
+ if (_Ptr1[_Idx] != _Ptr2[_Idx]) {
3855
+ _Equal = false ;
3856
+ break ;
3857
+ }
3858
+ }
3859
+
3860
+ if (_Equal) {
3861
+ return _Ptr1;
3862
+ }
3863
+ }
3864
+
3865
+ if (_Ptr1 == _First1) {
3866
+ return _Last1;
3867
+ }
3868
+
3869
+ --_Ptr1;
3870
+ }
3871
+ }
3872
+ }
3636
3873
} // unnamed namespace
3637
3874
3638
3875
extern " C" {
@@ -3757,6 +3994,16 @@ const void* __stdcall __std_search_2(
3757
3994
return __std_search_impl<_Find_traits_2, uint16_t >(_First1, _Last1, _First2, _Count2);
3758
3995
}
3759
3996
3997
+ const void * __stdcall __std_find_end_1 (
3998
+ const void * const _First1, const void * const _Last1, const void * const _First2, const size_t _Count2) noexcept {
3999
+ return __std_find_end_impl<_Find_traits_1, uint8_t >(_First1, _Last1, _First2, _Count2);
4000
+ }
4001
+
4002
+ const void * __stdcall __std_find_end_2 (
4003
+ const void * const _First1, const void * const _Last1, const void * const _First2, const size_t _Count2) noexcept {
4004
+ return __std_find_end_impl<_Find_traits_2, uint16_t >(_First1, _Last1, _First2, _Count2);
4005
+ }
4006
+
3760
4007
__declspec (noalias) size_t __stdcall __std_mismatch_1(
3761
4008
const void * const _First1, const void * const _First2, const size_t _Count) noexcept {
3762
4009
return __std_mismatch_impl<_Find_traits_1, uint8_t >(_First1, _First2, _Count);
0 commit comments