diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h
index 532da825..288f85d0 100644
--- a/src/avx512-16bit-common.h
+++ b/src/avx512-16bit-common.h
@@ -99,38 +99,11 @@ struct avx512_16bit_swizzle_ops {
         __m512i v = vtype::cast_to(reg);
         if constexpr (scale == 2) {
-            __m512i mask = _mm512_set_epi16(30,
-                                            31,
-                                            28,
-                                            29,
-                                            26,
-                                            27,
-                                            24,
-                                            25,
-                                            22,
-                                            23,
-                                            20,
-                                            21,
-                                            18,
-                                            19,
-                                            16,
-                                            17,
-                                            14,
-                                            15,
-                                            12,
-                                            13,
-                                            10,
-                                            11,
-                                            8,
-                                            9,
-                                            6,
-                                            7,
-                                            4,
-                                            5,
-                                            2,
-                                            3,
-                                            0,
-                                            1);
+            std::vector<uint16_t> arr
+                    = {1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
+                       10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
+                       23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
+            __m512i mask = _mm512_loadu_si512(arr.data());
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 4) {
@@ -160,108 +133,27 @@ struct avx512_16bit_swizzle_ops {
         if constexpr (scale == 2) {
             return swap_n<vtype, 2>(reg);
         }
         else if constexpr (scale == 4) {
-            __m512i mask = _mm512_set_epi16(28,
-                                            29,
-                                            30,
-                                            31,
-                                            24,
-                                            25,
-                                            26,
-                                            27,
-                                            20,
-                                            21,
-                                            22,
-                                            23,
-                                            16,
-                                            17,
-                                            18,
-                                            19,
-                                            12,
-                                            13,
-                                            14,
-                                            15,
-                                            8,
-                                            9,
-                                            10,
-                                            11,
-                                            4,
-                                            5,
-                                            6,
-                                            7,
-                                            0,
-                                            1,
-                                            2,
-                                            3);
+            std::vector<uint16_t> arr
+                    = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
+                       8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
+                       21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
+            __m512i mask = _mm512_loadu_si512(arr.data());
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 8) {
-            __m512i mask = _mm512_set_epi16(24,
-                                            25,
-                                            26,
-                                            27,
-                                            28,
-                                            29,
-                                            30,
-                                            31,
-                                            16,
-                                            17,
-                                            18,
-                                            19,
-                                            20,
-                                            21,
-                                            22,
-                                            23,
-                                            8,
-                                            9,
-                                            10,
-                                            11,
-                                            12,
-                                            13,
-                                            14,
-                                            15,
-                                            0,
-                                            1,
-                                            2,
-                                            3,
-                                            4,
-                                            5,
-                                            6,
-                                            7);
+            std::vector<uint16_t> arr
+                    = {7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
+                       12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
+                       17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
+            __m512i mask = _mm512_loadu_si512(arr.data());
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 16) {
-            __m512i mask = _mm512_set_epi16(16,
-                                            17,
-                                            18,
-                                            19,
-                                            20,
-                                            21,
-                                            22,
-                                            23,
-                                            24,
-                                            25,
-                                            26,
-                                            27,
-                                            28,
-                                            29,
-                                            30,
-                                            31,
-                                            0,
-                                            1,
-                                            2,
-                                            3,
-                                            4,
-                                            5,
-                                            6,
-                                            7,
-                                            8,
-                                            9,
-                                            10,
-                                            11,
-                                            12,
-                                            13,
-                                            14,
-                                            15);
+            std::vector<uint16_t> arr
+                    = {15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,
+                       4,  3,  2,  1,  0,  31, 30, 29, 28, 27, 26,
+                       25, 24, 23, 22, 21, 20, 19, 18, 17, 16};
+            __m512i mask = _mm512_loadu_si512(arr.data());
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 32) {
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
index b969a069..1fdf3627 100644
--- a/src/avx512-common-qsort.h
+++ b/src/avx512-common-qsort.h
@@ -41,6 +41,7 @@
 #include <cstring>
 #include <immintrin.h>
 #include <limits>
+#include <vector>
 
 #define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
 #define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
@@ -249,7 +250,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store,
                                              reg_t &biggest_vec)
 {
     typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
-    arrsize_t amount_ge_pivot = _mm_popcnt_u64(ge_mask);
+    int amount_ge_pivot = _mm_popcnt_u32((int)ge_mask);
     vtype::mask_compressstoreu(l_store, vtype::knot_opmask(ge_mask), curr_vec);
     vtype::mask_compressstoreu(
@@ -450,8 +451,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
         X86_SIMD_SORT_UNROLL_LOOP(8)
         for (int ii = 0; ii < num_unroll; ++ii) {
             curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
-            _mm_prefetch(arr + right + ii * vtype::numlanes
-                                 - num_unroll * vtype::numlanes,
+            _mm_prefetch((char *)(arr + right + ii * vtype::numlanes
+                                  - num_unroll * vtype::numlanes),
                          _MM_HINT_T0);
         }
     }
@@ -459,8 +460,8 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
         X86_SIMD_SORT_UNROLL_LOOP(8)
         for (int ii = 0; ii < num_unroll; ++ii) {
             curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
-            _mm_prefetch(arr + left + ii * vtype::numlanes
-                                 + num_unroll * vtype::numlanes,
+            _mm_prefetch((char *)(arr + left + ii * vtype::numlanes
+                                  + num_unroll * vtype::numlanes),
                          _MM_HINT_T0);
         }
         left += num_unroll * vtype::numlanes;