Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 20 additions & 128 deletions src/avx512-16bit-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,38 +99,11 @@ struct avx512_16bit_swizzle_ops {
__m512i v = vtype::cast_to(reg);

if constexpr (scale == 2) {
__m512i mask = _mm512_set_epi16(30,
31,
28,
29,
26,
27,
24,
25,
22,
23,
20,
21,
18,
19,
16,
17,
14,
15,
12,
13,
10,
11,
8,
9,
6,
7,
4,
5,
2,
3,
0,
1);
std::vector<uint16_t> arr
= {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
__m512i mask = _mm512_loadu_si512(arr.data());
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 4) {
Expand Down Expand Up @@ -160,108 +133,27 @@ struct avx512_16bit_swizzle_ops {

if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
else if constexpr (scale == 4) {
__m512i mask = _mm512_set_epi16(28,
29,
30,
31,
24,
25,
26,
27,
20,
21,
22,
23,
16,
17,
18,
19,
12,
13,
14,
15,
8,
9,
10,
11,
4,
5,
6,
7,
0,
1,
2,
3);
std::vector<uint16_t> arr
= {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
__m512i mask = _mm512_loadu_si512(arr.data());
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 8) {
__m512i mask = _mm512_set_epi16(24,
25,
26,
27,
28,
29,
30,
31,
16,
17,
18,
19,
20,
21,
22,
23,
8,
9,
10,
11,
12,
13,
14,
15,
0,
1,
2,
3,
4,
5,
6,
7);
std::vector<uint16_t> arr
= {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
__m512i mask = _mm512_loadu_si512(arr.data());
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 16) {
__m512i mask = _mm512_set_epi16(16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15);
std::vector<uint16_t> arr
= {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26,
25, 24, 23, 22, 21, 20, 19, 18, 17, 16};
__m512i mask = _mm512_loadu_si512(arr.data());
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 32) {
Expand Down
11 changes: 6 additions & 5 deletions src/avx512-common-qsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include <cstring>
#include <immintrin.h>
#include <limits>
#include <vector>

#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
Expand Down Expand Up @@ -249,7 +250,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store,
reg_t &biggest_vec)
{
typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
arrsize_t amount_ge_pivot = _mm_popcnt_u64(ge_mask);
int amount_ge_pivot = _mm_popcnt_u32((int)ge_mask);

vtype::mask_compressstoreu(l_store, vtype::knot_opmask(ge_mask), curr_vec);
vtype::mask_compressstoreu(
Expand Down Expand Up @@ -450,17 +451,17 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
_mm_prefetch(arr + right + ii * vtype::numlanes
- num_unroll * vtype::numlanes,
_mm_prefetch((char *)(arr + right + ii * vtype::numlanes
- num_unroll * vtype::numlanes),
_MM_HINT_T0);
}
}
else {
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
_mm_prefetch(arr + left + ii * vtype::numlanes
+ num_unroll * vtype::numlanes,
_mm_prefetch((char *)(arr + left + ii * vtype::numlanes
+ num_unroll * vtype::numlanes),
_MM_HINT_T0);
}
left += num_unroll * vtype::numlanes;
Expand Down