Skip to content

Commit cbd091e

Browse files
Vectorize rotate better (#5502)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 2391e5e commit cbd091e

File tree

7 files changed

+283
-1
lines changed

7 files changed

+283
-1
lines changed

benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ add_benchmark(regex_search src/regex_search.cpp)
120120
add_benchmark(remove src/remove.cpp)
121121
add_benchmark(replace src/replace.cpp)
122122
add_benchmark(reverse src/reverse.cpp)
123+
add_benchmark(rotate src/rotate.cpp)
123124
add_benchmark(search src/search.cpp)
124125
add_benchmark(search_n src/search_n.cpp)
125126
add_benchmark(std_copy src/std_copy.cpp)

benchmarks/src/rotate.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
#include <algorithm>
5+
#include <benchmark/benchmark.h>
6+
#include <cstdint>
7+
#include <vector>
8+
9+
#include "skewed_allocator.hpp"
10+
#include "utility.hpp"
11+
12+
using namespace std;
13+
14+
enum class AlgType { Std, Rng };
15+
16+
template <class T, AlgType Alg>
17+
void bm_rotate(benchmark::State& state) {
18+
const auto size = static_cast<size_t>(state.range(0));
19+
const auto n = static_cast<size_t>(state.range(1));
20+
21+
auto v = random_vector<T, not_highly_aligned_allocator>(size);
22+
benchmark::DoNotOptimize(v);
23+
24+
for (auto _ : state) {
25+
if constexpr (Alg == AlgType::Std) {
26+
rotate(v.begin(), v.begin() + n, v.end());
27+
} else {
28+
ranges::rotate(v, v.begin() + n);
29+
}
30+
benchmark::DoNotOptimize(v);
31+
}
32+
}
33+
34+
// Registers the shared {element count, rotation position} argument pairs on a benchmark.
// The pairs are applied in the order listed.
void common_args(auto bm) {
    static constexpr int64_t size_and_pos[][2] = {
        {3333, 2242},
        {3332, 1666},
        {3333, 1111},
        {3333, 501},
        {3333, 3300},
        {3333, 12},
        {3333, 5},
        {3333, 1},
        {333, 101},
        {123, 32},
        {23, 7},
        {12, 5},
        {3, 2},
    };

    for (const auto& [size, pos] : size_and_pos) {
        bm->Args({size, pos});
    }
}
39+
40+
// Element type that is not a plain integer: an aggregate of three 16-bit fields.
// (The member names suggest hue / saturation / lightness - not confirmed by this file.)
struct color {
    std::uint16_t h;
    std::uint16_t s;
    std::uint16_t l;
};
45+
46+
// Register bm_rotate for each fixed-width unsigned integer element type, once per algorithm flavor
// (AlgType::Std and AlgType::Rng). Every registration receives its arguments from common_args.
BENCHMARK(bm_rotate<uint8_t, AlgType::Std>)->Apply(common_args);
47+
BENCHMARK(bm_rotate<uint8_t, AlgType::Rng>)->Apply(common_args);
48+
BENCHMARK(bm_rotate<uint16_t, AlgType::Std>)->Apply(common_args);
49+
BENCHMARK(bm_rotate<uint16_t, AlgType::Rng>)->Apply(common_args);
50+
BENCHMARK(bm_rotate<uint32_t, AlgType::Std>)->Apply(common_args);
51+
BENCHMARK(bm_rotate<uint32_t, AlgType::Rng>)->Apply(common_args);
52+
BENCHMARK(bm_rotate<uint64_t, AlgType::Std>)->Apply(common_args);
53+
BENCHMARK(bm_rotate<uint64_t, AlgType::Rng>)->Apply(common_args);
54+
55+
// Same pair of registrations for the `color` struct (three uint16_t members).
BENCHMARK(bm_rotate<color, AlgType::Std>)->Apply(common_args);
56+
BENCHMARK(bm_rotate<color, AlgType::Rng>)->Apply(common_args);
57+
58+
// Expands to the benchmark executable's entry point.
BENCHMARK_MAIN();

stl/inc/algorithm

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5776,6 +5776,19 @@ namespace ranges {
57765776
}
57775777

57785778
if constexpr (bidirectional_iterator<_It>) {
5779+
#if _USE_STD_VECTOR_ALGORITHMS
5780+
using _Elem = remove_reference_t<iter_reference_t<_It>>;
5781+
5782+
if constexpr (contiguous_iterator<_It> && sized_sentinel_for<_Se, _It>
5783+
&& conjunction_v<_Is_trivially_ranges_swappable<_Elem>, negation<is_volatile<_Elem>>>) {
5784+
if (!_STD is_constant_evaluated()) {
5785+
const _It _Last_it = _First + (_Last - _First);
5786+
::__std_rotate(_STD to_address(_First), _STD to_address(_Mid), _STD to_address(_Last_it));
5787+
return {_First + (_Last - _Mid), _Last};
5788+
}
5789+
}
5790+
#endif // _USE_STD_VECTOR_ALGORITHMS
5791+
57795792
_RANGES _Reverse_common(_First, _Mid);
57805793
auto _Final = _RANGES _Get_final_iterator_unwrapped<_It>(_Mid, _STD move(_Last));
57815794
_RANGES _Reverse_common(_Mid, _Final);

stl/inc/xutility

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_8(void* _Firs
7979
__declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
8080
void* _First1, void* _Last1, void* _First2) noexcept;
8181

82+
__declspec(noalias) void __stdcall __std_rotate(void* _First, void* _Mid, void* _Last) noexcept;
83+
8284
__declspec(noalias) size_t __stdcall __std_count_trivial_1(
8385
const void* _First, const void* _Last, uint8_t _Val) noexcept;
8486
__declspec(noalias) size_t __stdcall __std_count_trivial_2(
@@ -6597,6 +6599,17 @@ _CONSTEXPR20 _FwdIt rotate(_FwdIt _First, _FwdIt _Mid, _FwdIt _Last) {
65976599
}
65986600

65996601
if constexpr (_Is_cpp17_random_iter_v<_FwdIt>) {
6602+
#if _USE_STD_VECTOR_ALGORITHMS
6603+
using _Elem = remove_reference_t<_Iter_ref_t<decltype(_UFirst)>>;
6604+
6605+
if constexpr (conjunction_v<bool_constant<_Iterator_is_contiguous<decltype(_UFirst)>>,
6606+
_Is_trivially_swappable<_Elem>, negation<is_volatile<_Elem>>>) {
6607+
if (!_STD _Is_constant_evaluated()) {
6608+
::__std_rotate(_STD _To_address(_UFirst), _STD _To_address(_UMid), _STD _To_address(_ULast));
6609+
return _First + (_Last - _Mid);
6610+
}
6611+
}
6612+
#endif // _USE_STD_VECTOR_ALGORITHMS
66006613
_STD reverse(_UFirst, _UMid);
66016614
_STD reverse(_UMid, _ULast);
66026615
_STD reverse(_UFirst, _ULast);

stl/src/vector_algorithms.cpp

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,107 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
382382

383383
} // extern "C"
384384

385+
namespace {
386+
namespace _Rotating {
387+
// TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs":
388+
// As a workaround, the following code calls memmove() for 8 KB portions.
389+
constexpr size_t _Portion_size = 8192;
390+
constexpr size_t _Portion_mask = _Portion_size - 1;
391+
static_assert((_Portion_size & _Portion_mask) == 0);
392+
393+
void _Move_to_lower_address(void* _Dest, const void* _Src, const size_t _Size) noexcept {
394+
const size_t _Whole_portions_size = _Size & ~_Portion_mask;
395+
396+
void* _Dest_end = _Dest;
397+
_Advance_bytes(_Dest_end, _Whole_portions_size);
398+
399+
while (_Dest != _Dest_end) {
400+
memmove(_Dest, _Src, _Portion_size);
401+
_Advance_bytes(_Dest, _Portion_size);
402+
_Advance_bytes(_Src, _Portion_size);
403+
}
404+
405+
if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) {
406+
memmove(_Dest, _Src, _Tail);
407+
}
408+
}
409+
410+
void _Move_to_higher_address(void* const _Dest, const void* const _Src, const size_t _Size) noexcept {
411+
const size_t _Whole_portions_size = _Size & ~_Portion_mask;
412+
413+
void* _Dest_end = _Dest;
414+
_Advance_bytes(_Dest_end, _Whole_portions_size);
415+
const void* _Src_end = _Src;
416+
_Advance_bytes(_Src_end, _Whole_portions_size);
417+
418+
if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) {
419+
memmove(_Dest_end, _Src_end, _Tail);
420+
}
421+
422+
while (_Dest_end != _Dest) {
423+
_Rewind_bytes(_Dest_end, _Portion_size);
424+
_Rewind_bytes(_Src_end, _Portion_size);
425+
memmove(_Dest_end, _Src_end, _Portion_size);
426+
}
427+
}
428+
429+
constexpr size_t _Buf_size = 512;
430+
431+
bool _Use_buffer(const size_t _Smaller, const size_t _Larger) noexcept {
432+
return _Smaller <= _Buf_size && (_Smaller <= 128 || _Larger >= _Smaller * 2);
433+
}
434+
} // namespace _Rotating
435+
} // unnamed namespace
436+
437+
extern "C" {
438+
439+
// Rotates the raw byte range [_First, _Last) in place so that the bytes starting at _Mid end up at
// the front. Each pass of the loop either finishes the job through a small stack buffer, or swaps one
// equal-sized piece into its final position and continues with the remainder that is still unrotated.
// _Mid itself never changes; only _First and _Last are moved inwards.
__declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, void* _Last) noexcept {
440+
// Scratch space for the smaller side when _Use_buffer() approves.
unsigned char _Buf[_Rotating::_Buf_size];
441+
442+
for (;;) {
443+
// Sizes of the two sides of the current sub-problem (presumably in bytes, per the helper's name).
const size_t _Left = _Byte_length(_First, _Mid);
444+
const size_t _Right = _Byte_length(_Mid, _Last);
445+
446+
if (_Left <= _Right) {
447+
if (_Left == 0) {
448+
// Nothing before _Mid: already rotated.
break;
449+
}
450+
451+
if (_Rotating::_Use_buffer(_Left, _Right)) {
452+
// Save the left side, slide the right side down to _First, then append the saved left side.
memcpy(_Buf, _First, _Left);
453+
_Rotating::_Move_to_lower_address(_First, _Mid, _Right);
454+
_Advance_bytes(_First, _Right);
455+
memcpy(_First, _Buf, _Left);
456+
break;
457+
}
458+
459+
// Exchange the left side with the last _Left bytes of the right side. The left side's bytes now
// sit at the very end (their final place), so the range is shrunk from the back.
void* _Mid2 = _Last;
460+
_Rewind_bytes(_Mid2, _Left);
461+
__std_swap_ranges_trivially_swappable_noalias(_Mid2, _Last, _First);
462+
_Last = _Mid2;
463+
} else {
464+
if (_Right == 0) {
465+
// Nothing after _Mid: already rotated.
break;
466+
}
467+
468+
if (_Rotating::_Use_buffer(_Right, _Left)) {
469+
// After the rewind _Last points at the start of the right side. Save the right side, slide the
// left side up by _Right bytes, then place the saved right side at the front.
_Rewind_bytes(_Last, _Right);
470+
memcpy(_Buf, _Last, _Right);
471+
void* _Mid2 = _First;
472+
_Advance_bytes(_Mid2, _Right);
473+
_Rotating::_Move_to_higher_address(_Mid2, _First, _Left);
474+
memcpy(_First, _Buf, _Right);
475+
break;
476+
}
477+
478+
// Exchange the right side with the first _Right bytes of the left side. The right side's bytes now
// sit at the very front (their final place), so the range is shrunk from the front.
__std_swap_ranges_trivially_swappable_noalias(_Mid, _Last, _First);
479+
_Advance_bytes(_First, _Right);
480+
}
481+
}
482+
}
483+
484+
} // extern "C"
485+
385486
namespace {
386487
namespace _Sorting {
387488
enum _Min_max_mode {

tests/std/tests/GH_005421_vector_algorithms_integer_class_type_iterator/test.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ int main() {
9898
picky_contiguous_iterator float_arr_begin(begin(float_arr));
9999
picky_contiguous_iterator float_arr_end(end(float_arr));
100100

101-
transform(arr_begin, arr_end, float_arr_begin, [](int v) { return static_cast<float>(v); });
101+
transform(arr_begin, arr_end, float_arr_begin, [](const int v) { return static_cast<float>(v); });
102102

103103
assert(ranges::min(ranges::subrange(float_arr_begin, float_arr_end)) == 200.0);
104104
assert(ranges::max(ranges::subrange(float_arr_begin, float_arr_end)) == 390.0);
@@ -196,6 +196,30 @@ int main() {
196196
ranges::reverse(temp_begin, temp_end);
197197
assert(ranges::equal(temp_begin, temp_end, begin(reverse_expected), end(reverse_expected)));
198198
}
199+
{
200+
const int rotate_expected[] = {
201+
250, 270, 280, 290, 300, 310, 320, 250, 340, 250, 250, 370, 380, 390, 200, 210, 220, 250, 240, 250};
202+
203+
const _Signed128 rotate_pos = 6;
204+
205+
auto rot_copy_it = rotate_copy(arr_begin, arr_begin + rotate_pos, arr_end, temp_begin);
206+
assert(equal(temp_begin, temp_end, begin(rotate_expected), end(rotate_expected)));
207+
assert(rot_copy_it == temp_end);
208+
209+
copy(arr_begin, arr_end, temp_begin);
210+
auto rot_it = rotate(temp_begin, temp_begin + rotate_pos, temp_end);
211+
assert(equal(temp_begin, temp_end, begin(rotate_expected), end(rotate_expected)));
212+
assert(rot_it == temp_end - rotate_pos);
213+
214+
auto r_rot_copy_it = ranges::rotate_copy(arr_begin, arr_begin + rotate_pos, arr_end, temp_begin).out;
215+
assert(ranges::equal(temp_begin, temp_end, begin(rotate_expected), end(rotate_expected)));
216+
assert(r_rot_copy_it == temp_end);
217+
218+
ranges::copy(arr_begin, arr_end, temp_begin);
219+
auto r_rot_it = begin(ranges::rotate(temp_begin, temp_begin + rotate_pos, temp_end));
220+
assert(ranges::equal(temp_begin, temp_end, begin(rotate_expected), end(rotate_expected)));
221+
assert(r_rot_it == temp_end - rotate_pos);
222+
}
199223
{
200224
// Out of replace family, only replace for 32-bit and 64-bit elements is manually vectorized,
201225
// replace_copy is auto vectorized (along with replace_copy_if)

tests/std/tests/VSO_0000000_vector_algorithms/test.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -743,6 +743,65 @@ void test_reverse_copy(mt19937_64& gen) {
743743
}
744744
}
745745

746+
// Reference implementation of rotate used to validate the vectorized one.
//
// Rotates [first, last) so that *mid becomes the first element. The shorter side is copied into `tmp`,
// the longer side is shifted over it, and the saved elements are moved into the vacated space.
//
//   first, mid, last - random-access iterators with first <= mid <= last
//   tmp              - caller-provided scratch vector; its previous contents are discarded
//
// The longer side's source and destination overlap, so the shift direction matters:
//   * shifting towards `first` must run front-to-back (std::move),
//   * shifting towards `last` must run back-to-front (std::move_backward).
// Using the opposite direction violates the algorithms' preconditions and corrupts elements
// whenever the copy is not lowered to memmove (e.g. for std::string elements).
template <class RanIt>
void last_known_good_rotate(
    RanIt first, RanIt mid, RanIt last, std::vector<typename std::iterator_traits<RanIt>::value_type>& tmp) {
    const auto size_left  = mid - first;
    const auto size_right = last - mid;

    if (size_left == 0 || size_right == 0) {
        // Already in rotated order; shifting would only self-move-assign every element.
        tmp.clear();
        return;
    }

    if (size_left <= size_right) {
        tmp.assign(first, mid);
        // Destination [first, first + size_right) begins below the source [mid, last).
        std::move(mid, last, first);
        std::move(tmp.begin(), tmp.end(), last - size_left);
    } else {
        tmp.assign(mid, last);
        // Destination [first + size_right, last) ends above the source [first, mid).
        std::move_backward(first, mid, last);
        std::move(tmp.begin(), tmp.end(), first);
    }
}
761+
762+
template <class T>
763+
void test_case_rotate(
764+
vector<T>& actual, vector<T>& actual_r, vector<T>& expected, const ptrdiff_t pos, vector<T>& tmp) {
765+
const ptrdiff_t shift = static_cast<ptrdiff_t>(expected.size()) - pos;
766+
last_known_good_rotate(expected.begin(), expected.begin() + pos, expected.end(), tmp);
767+
const auto it = rotate(actual.begin(), actual.begin() + pos, actual.end());
768+
assert(expected == actual);
769+
assert(it == actual.begin() + shift);
770+
#if _HAS_CXX20
771+
const auto rng = ranges::rotate(actual_r.begin(), actual_r.begin() + pos, actual_r.end());
772+
assert(expected == actual_r);
773+
assert(begin(rng) == actual_r.begin() + shift);
774+
assert(end(rng) == actual_r.end());
775+
#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
776+
(void) actual_r;
777+
#endif // ^^^ !_HAS_CXX20 ^^^
778+
}
779+
780+
template <class T>
781+
void test_rotate(mt19937_64& gen) {
782+
vector<T> actual;
783+
vector<T> actual_r;
784+
vector<T> expected;
785+
vector<T> tmp;
786+
actual.reserve(dataCount);
787+
actual_r.reserve(dataCount);
788+
expected.reserve(dataCount);
789+
tmp.reserve(dataCount);
790+
test_case_rotate(actual, actual_r, expected, 0, tmp);
791+
for (size_t attempts = 0; attempts < dataCount; ++attempts) {
792+
const T val = static_cast<T>(gen()); // intentionally narrows
793+
actual.push_back(val);
794+
actual_r.push_back(val);
795+
expected.push_back(val);
796+
797+
uniform_int_distribution<ptrdiff_t> dis_pos(0, static_cast<ptrdiff_t>(attempts) + 1);
798+
799+
for (size_t pos_count = 0; pos_count != 5; ++pos_count) {
800+
test_case_rotate(actual, actual_r, expected, dis_pos(gen), tmp);
801+
}
802+
}
803+
}
804+
746805
template <class FwdIt1, class FwdIt2>
747806
FwdIt2 last_known_good_swap_ranges(FwdIt1 first1, const FwdIt1 last1, FwdIt2 dest) {
748807
for (; first1 != last1; ++first1, ++dest) {
@@ -1182,6 +1241,19 @@ void test_vector_algorithms(mt19937_64& gen) {
11821241
test_reverse_copy<double>(gen);
11831242
test_reverse_copy<long double>(gen);
11841243

1244+
test_rotate<char>(gen);
1245+
test_rotate<signed char>(gen);
1246+
test_rotate<unsigned char>(gen);
1247+
test_rotate<short>(gen);
1248+
test_rotate<unsigned short>(gen);
1249+
test_rotate<int>(gen);
1250+
test_rotate<unsigned int>(gen);
1251+
test_rotate<long long>(gen);
1252+
test_rotate<unsigned long long>(gen);
1253+
test_rotate<float>(gen);
1254+
test_rotate<double>(gen);
1255+
test_rotate<long double>(gen);
1256+
11851257
test_remove<char>(gen);
11861258
test_remove<signed char>(gen);
11871259
test_remove<unsigned char>(gen);

0 commit comments

Comments
 (0)