Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ BENCHOBJS := $(patsubst %.cpp, %.o, $(filter-out $(addprefix $(BENCHDIR)/, $(BEN
TESTOBJS := $(patsubst %.cpp, %.o, $(filter-out $(addprefix $(TESTDIR)/, $(TESTS_SKIP)), $(TESTS)))
UTILOBJS := $(UTILS:.cpp=.o)

# Stops make from wondering if it needs to generate the .hpp files (.cpp and .h have equivalent rules by default)
# Stops make from wondering if it needs to generate the .hpp files (.cpp and .h have equivalent rules by default)
%.hpp:

.PHONY: all
Expand Down Expand Up @@ -75,7 +75,7 @@ benchexe: $(BENCHOBJS) $(UTILOBJS)

.PHONY: meson
meson:
meson setup --warnlevel 0 --buildtype plain builddir
meson setup --warnlevel 2 --buildtype plain builddir
cd builddir && ninja

.PHONY: clean
Expand Down
26 changes: 13 additions & 13 deletions src/avx512-64bit-argsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
zmm_t arrzmm[4];
argzmm_t argzmm[4];

#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
X86_SIMD_SORT_UNROLL_LOOP(2)
for (int ii = 0; ii < 2; ++ii) {
argzmm[ii] = argtype::loadu(arg + 8 * ii);
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
Expand All @@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)

uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull;
opmask_t load_mask[2] = {0xFF, 0xFF};
#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
X86_SIMD_SORT_UNROLL_LOOP(2)
for (int ii = 0; ii < 2; ++ii) {
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii);
Expand Down Expand Up @@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
zmm_t arrzmm[8];
argzmm_t argzmm[8];

#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argzmm[ii] = argtype::loadu(arg + 8 * ii);
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
Expand All @@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)

opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF};
uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii);
Expand All @@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
argzmm[ii + 4]);
}

#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 8; ii = ii + 2) {
bitonic_merge_two_zmm_64bit<vtype, argtype>(
arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
Expand All @@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
bitonic_merge_four_zmm_64bit<vtype, argtype>(arrzmm + 4, argzmm + 4);
bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm, argzmm);

#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argtype::storeu(arg + 8 * ii, argzmm[ii]);
}
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]);
}
Expand All @@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// zmm_t arrzmm[16];
// argzmm_t argzmm[16];
//
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
//X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argzmm[ii] = argtype::loadu(arg + 8*ii);
// arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
Expand All @@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
// if (N != 128) {
// uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
//X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
// }
// }
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
//X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
// arrzmm[ii+8] = vtype::template mask_i64gather<sizeof(type_t)>(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
// arrzmm[ii+8] = sort_zmm_64bit<vtype, argtype>(arrzmm[ii+8], argzmm[ii+8]);
// }
//
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
//X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 16; ii = ii + 2) {
// bitonic_merge_two_zmm_64bit<vtype, argtype>(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
// }
Expand All @@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm+8, argzmm+8);
// bitonic_merge_sixteen_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
//
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
//X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argtype::storeu(arg + 8*ii, argzmm[ii]);
// }
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
//X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
// }
Expand Down
12 changes: 6 additions & 6 deletions src/avx512-common-argsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
// first and last vtype::numlanes values are partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
argzmm_t argvec_left[num_unroll], argvec_right[num_unroll];
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii);
vec_left[ii] = vtype::template i64gather<sizeof(type_t)>(
Expand All @@ -224,7 +224,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arg_vec[ii]
= argtype::loadu(arg + right + ii * vtype::numlanes);
Expand All @@ -233,7 +233,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}
}
else {
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes);
curr_vec[ii] = vtype::template i64gather<sizeof(type_t)>(
Expand All @@ -242,7 +242,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
left += num_unroll * vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
Expand All @@ -259,7 +259,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}

/* partition and save vec_left and vec_right */
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
Expand All @@ -273,7 +273,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
l_store += (vtype::numlanes - amount_gt_pivot);
r_store -= amount_gt_pivot;
}
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
Expand Down
17 changes: 9 additions & 8 deletions src/avx512-common-qsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
/* 512-bit integer vector with X86_SIMD_SORT_MAX_INT16 broadcast into every
 * 16-bit lane (via _mm512_set1_epi16). */
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
/*
 * Packs four 2-bit selectors into one 8-bit immediate:
 *   bits 7:6 = a, bits 5:4 = b, bits 3:2 = c, bits 1:0 = d.
 * Every argument and the whole expansion are parenthesized so the macro
 * composes correctly with neighbouring operators (e.g. `SHUFFLE_MASK(..) & m`)
 * and with arguments that are themselves expressions.
 */
#define SHUFFLE_MASK(a, b, c, d) \
    (((a) << 6) | ((b) << 4) | ((c) << 2) | (d))

/* Stringizes its argument and passes it to the _Pragma operator, which lets a
 * pragma be produced from inside another macro's expansion (a literal #pragma
 * directive cannot appear in a macro replacement list). */
#define PRAGMA(x) _Pragma (#x)

/* Compiler-specific macros */
#ifdef _MSC_VER
#define X86_SIMD_SORT_INLINE static inline
Expand All @@ -93,8 +95,7 @@
#endif

#if __GNUC__ >= 8
#define X86_SIMD_SORT_UNROLL_LOOP(num)\
GCC unroll num
#define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(GCC unroll num)
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num)
#endif
Expand Down Expand Up @@ -393,7 +394,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
// We will now have at least 16 registers' worth of data to process:
// left and right vtype::numlanes values are partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
vec_right[ii] = vtype::loadu(
Expand All @@ -414,20 +415,20 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
}
}
else {
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
}
left += num_unroll * vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
Expand All @@ -443,7 +444,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}

/* partition and save vec_left[8] and vec_right[8] */
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
Expand All @@ -456,7 +457,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
Expand Down
6 changes: 4 additions & 2 deletions tests/test-argselect.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,15 @@ TYPED_TEST_P(avx512argselect, test_random)
= avx512_argselect<TypeParam>(arr.data(), k, arr.size());
auto true_kth = arr[sorted_inx[k]];
EXPECT_EQ(true_kth, arr[inx[k]]) << "Failed at index k = " << k;
if (k >= 1)
if (k >= 1) {
EXPECT_GE(true_kth, std_max_element(arr, inx, 0, k - 1))
<< "failed at k = " << k;
if (k != arrsize - 1)
}
if (k != arrsize - 1) {
EXPECT_LE(true_kth,
std_min_element(arr, inx, k + 1, arrsize - 1))
<< "failed at k = " << k;
}
EXPECT_UNIQUE(inx)
}
}
Expand Down
18 changes: 9 additions & 9 deletions tests/test-argsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ TYPED_TEST_P(avx512argsort, test_random)
std::vector<int64_t> inx2
= avx512_argsort<TypeParam>(arr.data(), arr.size());
std::vector<TypeParam> sort1, sort2;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sort1.push_back(arr[inx1[jj]]);
sort2.push_back(arr[inx2[jj]]);
}
Expand All @@ -48,14 +48,14 @@ TYPED_TEST_P(avx512argsort, test_constant)
for (auto &size : arrsizes) {
/* constant array */
auto elem = get_uniform_rand_array<TypeParam>(1)[0];
for (int64_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
arr.push_back(elem);
}
std::vector<int64_t> inx1 = std_argsort(arr);
std::vector<int64_t> inx2
= avx512_argsort<TypeParam>(arr.data(), arr.size());
std::vector<TypeParam> sort1, sort2;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sort1.push_back(arr[inx1[jj]]);
sort2.push_back(arr[inx2[jj]]);
}
Expand Down Expand Up @@ -84,7 +84,7 @@ TYPED_TEST_P(avx512argsort, test_small_range)
std::vector<int64_t> inx2
= avx512_argsort<TypeParam>(arr.data(), arr.size());
std::vector<TypeParam> sort1, sort2;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sort1.push_back(arr[inx1[jj]]);
sort2.push_back(arr[inx2[jj]]);
}
Expand Down Expand Up @@ -113,7 +113,7 @@ TYPED_TEST_P(avx512argsort, test_sorted)
std::vector<int64_t> inx2
= avx512_argsort<TypeParam>(arr.data(), arr.size());
std::vector<TypeParam> sort1, sort2;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sort1.push_back(arr[inx1[jj]]);
sort2.push_back(arr[inx2[jj]]);
}
Expand Down Expand Up @@ -143,7 +143,7 @@ TYPED_TEST_P(avx512argsort, test_reverse)
std::vector<int64_t> inx2
= avx512_argsort<TypeParam>(arr.data(), arr.size());
std::vector<TypeParam> sort1, sort2;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sort1.push_back(arr[inx1[jj]]);
sort2.push_back(arr[inx2[jj]]);
}
Expand Down Expand Up @@ -177,7 +177,7 @@ TYPED_TEST_P(avx512argsort, test_array_with_nan)
std::vector<int64_t> inx
= avx512_argsort<TypeParam>(arr.data(), arr.size());
std::vector<TypeParam> sort1;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sort1.push_back(arr[inx[jj]]);
}
if ((!std::isnan(sort1[size - 1])) || (!std::isnan(sort1[size - 2]))) {
Expand Down Expand Up @@ -211,7 +211,7 @@ TYPED_TEST_P(avx512argsort, test_max_value_at_end_of_array)
}
std::vector<int64_t> inx = avx512_argsort(arr.data(), arr.size());
std::vector<TypeParam> sorted;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sorted.push_back(arr[inx[jj]]);
}
if (!std::is_sorted(sorted.begin(), sorted.end())) {
Expand Down Expand Up @@ -250,7 +250,7 @@ TYPED_TEST_P(avx512argsort, test_all_inf_array)
}
std::vector<int64_t> inx = avx512_argsort(arr.data(), arr.size());
std::vector<TypeParam> sorted;
for (size_t jj = 0; jj < size; ++jj) {
for (auto jj = 0; jj < size; ++jj) {
sorted.push_back(arr[inx[jj]]);
}
if (!std::is_sorted(sorted.begin(), sorted.end())) {
Expand Down
4 changes: 2 additions & 2 deletions tests/test-partial-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)
/* Sort with std::sort for comparison */
std::sort(sortedarr.begin(), sortedarr.end());

for (size_t ii = 0; ii < nranges; ++ii) {
for (auto ii = 0; ii < nranges; ++ii) {
psortedarr = arr;

/* Pick a random number of elements to sort at the beginning of the array */
Expand All @@ -33,7 +33,7 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)
/* Sort the range and verify all the required elements match the presorted set */
avx512_partial_qsort<TypeParam>(
psortedarr.data(), k, psortedarr.size());
for (size_t jj = 0; jj < k; jj++) {
for (auto jj = 0; jj < k; jj++) {
ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
}

Expand Down