diff --git a/Makefile b/Makefile index df55afbf..6169ec1b 100644 --- a/Makefile +++ b/Makefile @@ -38,4 +38,4 @@ meson: cd builddir && ninja clean: - $(RM) -rf $(TESTDIR)/*.o $(UTILS)/*.o testexe benchexe builddir + $(RM) -rf $(TESTDIR)/*.o $(BENCHDIR)/*.o $(UTILS)/*.o testexe benchexe builddir diff --git a/benchmarks/bench-qsort-common.h b/benchmarks/bench-qsort-common.h new file mode 100644 index 00000000..fe0decf7 --- /dev/null +++ b/benchmarks/bench-qsort-common.h @@ -0,0 +1,11 @@ +#ifndef AVX512_BENCH_COMMON +#define AVX512_BENCH_COMMON + +#include +#include "rand_array.h" +#include "cpuinfo.h" +#include "avx512-16bit-qsort.hpp" +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" + +#endif diff --git a/benchmarks/bench_partial_qsort.hpp b/benchmarks/bench_partial_qsort.hpp new file mode 100644 index 00000000..d54ceb31 --- /dev/null +++ b/benchmarks/bench_partial_qsort.hpp @@ -0,0 +1,72 @@ +#include "bench-qsort-common.h" + +template +static void avx512_partial_qsort(benchmark::State& state) { + if (!cpu_has_avx512bw()) { + state.SkipWithMessage("Requires AVX512 BW ISA"); + } + if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { + state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); + } + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call avx512_partial_qsort */ + for (auto _ : state) { + avx512_partial_qsort(arr.data(), K, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +template +static void stdpartialsort(benchmark::State& state) { + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call std::partial_sort */ + for (auto _ : state) { + std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); + +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); + +//BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); diff --git a/benchmarks/bench_qselect.hpp b/benchmarks/bench_qselect.hpp new file mode 100644 index 00000000..fea5bea4 --- /dev/null +++ b/benchmarks/bench_qselect.hpp @@ -0,0 +1,72 @@ +#include "bench-qsort-common.h" + +template +static void avx512_qselect(benchmark::State& state) { + if (!cpu_has_avx512bw()) { + state.SkipWithMessage("Requires AVX512 BW ISA"); + } + if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { + state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); + } + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call avx512 quickselect */ + for (auto _ : state) { + avx512_qselect(arr.data(), K, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +template +static void stdnthelement(benchmark::State& state) { + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call std::nth_element */ + for (auto _ : state) { + std::nth_element(arr.begin(), arr.begin() + K, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); + +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); + +//BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); diff --git a/benchmarks/bench_qsort.cpp b/benchmarks/bench_qsort.cpp index 3f622fc3..d14bf218 100644 --- a/benchmarks/bench_qsort.cpp +++ b/benchmarks/bench_qsort.cpp @@ -1,73 +1,3 @@ -#include -#include "rand_array.h" -#include "cpuinfo.h" -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" - -template -static void avx512_qsort(benchmark::State& state) { - if (!cpu_has_avx512bw()) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { - state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); - } - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements is reverse order */ - arr = get_uniform_rand_array(ARRSIZE); - arr_bkp = arr; - - /* call avx512 quicksort */ - for (auto _ : state) { - avx512_qsort(arr.data(), ARRSIZE); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } -} - -template -static void stdsort(benchmark::State& state) { - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements is reverse order */ - arr = get_uniform_rand_array(ARRSIZE); - arr_bkp = arr; - - /* call avx512 quicksort */ - for (auto _ : state) { - std::sort(arr.begin(), arr.end()); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); - -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); - -//BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); -BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort)->Arg(10000)->Arg(10000000); +#include "bench_qsort.hpp" +#include "bench_qselect.hpp" +#include "bench_partial_qsort.hpp" diff --git a/benchmarks/bench_qsort.hpp b/benchmarks/bench_qsort.hpp new file mode 100644 index 00000000..6659fdae --- /dev/null +++ b/benchmarks/bench_qsort.hpp @@ -0,0 +1,68 @@ +#include "bench-qsort-common.h" + +template +static void avx512_qsort(benchmark::State& state) { + if (!cpu_has_avx512bw()) { + state.SkipWithMessage("Requires AVX512 BW ISA"); + } + if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) { + state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); + } + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call avx512 quicksort */ + for (auto _ : state) { + avx512_qsort(arr.data(), ARRSIZE); + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +template +static void stdsort(benchmark::State& state) { + // Perform setup here + size_t ARRSIZE = state.range(0); + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + arr = get_uniform_rand_array(ARRSIZE); + arr_bkp = arr; + + /* call std::sort */ + for (auto _ : state) { + std::sort(arr.begin(), arr.end()); + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); + +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); + +//BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); +BENCHMARK(avx512_qsort)->Arg(10000)->Arg(1000000); +BENCHMARK(stdsort)->Arg(10000)->Arg(1000000); diff --git a/benchmarks/bench_qsortfp16.cpp b/benchmarks/bench_qsortfp16.cpp index 0b454f7e..eddd876d 100644 --- a/benchmarks/bench_qsortfp16.cpp +++ b/benchmarks/bench_qsortfp16.cpp @@ -45,7 +45,7 @@ static void stdsort(benchmark::State& state) { } arr_bkp = arr; - /* call avx512 quicksort */ + /* call std::sort */ for (auto _ : state) { std::sort(arr.begin(), arr.end()); state.PauseTiming(); @@ -61,3 +61,131 @@ static void stdsort(benchmark::State& state) { // Register the function as a benchmark BENCHMARK(avx512_qsort<_Float16>)->Arg(10000)->Arg(1000000); BENCHMARK(stdsort<_Float16>)->Arg(10000)->Arg(1000000); + +template +static void avx512_qselect(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* call avx512 quickselect */ + for (auto _ : state) { + avx512_qselect(arr.data(), K, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +template +static void stdnthelement(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* call std::nth_element */ + for (auto _ : state) { + std::nth_element(arr.begin(), arr.begin() + K, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_qselect<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdnthelement<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); + +template +static void avx512_partial_qsort(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* call avx512_partial_qsort */ + for (auto _ : state) { + avx512_partial_qsort(arr.data(), K, ARRSIZE); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +template +static void stdpartialsort(benchmark::State& state) { + if (cpu_has_avx512fp16()) { + // Perform setup here + int64_t K = state.range(0); + size_t ARRSIZE = 10000; + std::vector arr; + std::vector arr_bkp; + + /* Initialize elements */ + for (size_t jj = 0; jj < ARRSIZE; ++jj) { + _Float16 temp = (float) rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + arr_bkp = arr; + + /* call std::partial_sort */ + for (auto _ : state) { + std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); + + state.PauseTiming(); + arr = arr_bkp; + state.ResumeTiming(); + } + } + else { + state.SkipWithMessage("Requires AVX512-FP16 ISA"); + } +} + +// Register the function as a benchmark +BENCHMARK(avx512_partial_qsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +BENCHMARK(stdpartialsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); diff --git a/meson.build b/meson.build index 79b23927..9193776d 100644 --- a/meson.build +++ b/meson.build @@ -11,8 +11,8 @@ gbench_dep = dependency('benchmark', required : true) fp16code = '''#include int main() { - __mm512h temp = _mm512_set1_ph(1.0f); - __mm512h var2 = _mm512_min_ph(temp, temp); + __m512h temp = _mm512_set1_ph(1.0f); + __m512h var2 = _mm512_min_ph(temp, temp); return 0; } ''' diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index 7ab22123..6e0743d6 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -289,4 +289,36 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_16bit_(arr, pivot_index, right, max_iters - 1); } +template +static void +qselect_16bit_(type_t *arr, int64_t pos, + int64_t left, int64_t right, + int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1, comparison_func); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_16bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_16bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_16bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_16bit_(arr, pos, pivot_index, right, max_iters - 1); +} + #endif // AVX512_16BIT_COMMON diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index fcbaf879..606f8706 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -405,6 +405,34 @@ replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count) } } +template <> +void avx512_qselect(int16_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_16bit_, int16_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint16_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_16bit_, uint16_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_16bit_, uint16_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(int16_t *arr, int64_t arrsize) { @@ -432,4 +460,5 @@ void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) replace_inf_with_nan(arr, arrsize, nan_count); } } + #endif // AVX512_QSORT_16BIT diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index 1cbba00b..e9e97aa1 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -656,6 +656,38 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_32bit_(arr, pivot_index, right, max_iters - 1); } +template +static void +qselect_32bit_(type_t *arr, int64_t pos, + int64_t left, int64_t right, + int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_32bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_32bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_32bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_32bit_(arr, pos, pivot_index, right, max_iters - 1); +} + X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize) { int64_t nan_count = 0; @@ -681,6 +713,35 @@ replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count) } } +template <> +void avx512_qselect(int32_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_32bit_, int32_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint32_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_32bit_, uint32_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(float *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_32bit_, float>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(int32_t *arr, int64_t arrsize) { diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 32a4731e..7fc8acf3 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -4,10 +4,16 @@ * Authors: Raghuveer Devulapalli * ****************************************************************/ -#ifndef AVX512_64BIT_COMMOM -#define AVX512_64BIT_COMMOM +#ifndef AVX512_64BIT_COMMON +#define AVX512_64BIT_COMMON #include "avx512-common-qsort.h" +/* + * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic + * sorting network (see + * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) + */ +// ZMM 7, 6, 5, 4, 3, 2, 1, 0 #define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3 #define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7 #define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2 diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp index 62000549..dfb5376f 100644 --- a/src/avx512-64bit-qsort.hpp +++ b/src/avx512-64bit-qsort.hpp @@ -9,13 +9,6 @@ #include "avx512-64bit-common.h" -/* - * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -// ZMM 7, 6, 5, 4, 3, 2, 1, 0 - // Assumes zmm is bitonic and performs a recursive half cleaner template X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm) @@ -408,6 +401,67 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) qsort_64bit_(arr, pivot_index, right, max_iters - 1); } +template +static void +qselect_64bit_(type_t *arr, int64_t pos, + int64_t left, int64_t right, + int64_t max_iters) +{ + /* + * Resort to std::sort if quicksort isnt making any progress + */ + if (max_iters <= 0) { + std::sort(arr + left, arr + right + 1); + return; + } + /* + * Base case: use bitonic networks to sort arrays <= 128 + */ + if (right + 1 - left <= 128) { + sort_128_64bit(arr + left, (int32_t)(right + 1 - left)); + return; + } + + type_t pivot = get_pivot_64bit(arr, left, right); + type_t smallest = vtype::type_max(); + type_t biggest = vtype::type_min(); + int64_t pivot_index = partition_avx512( + arr, left, right + 1, pivot, &smallest, &biggest); + if ((pivot != smallest) && (pos < pivot_index)) + qselect_64bit_(arr, pos, left, pivot_index - 1, max_iters - 1); + else if ((pivot != biggest) && (pos >= pivot_index)) + qselect_64bit_(arr, pos, pivot_index, right, max_iters - 1); +} + +template <> +void avx512_qselect(int64_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_64bit_, int64_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(uint64_t *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + qselect_64bit_, uint64_t>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + } +} + +template <> +void avx512_qselect(double *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_64bit_, double>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(int64_t *arr, int64_t arrsize) { diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index a80f2721..5b6591f0 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -88,9 +88,23 @@ struct zmm_vector; template void avx512_qsort(T *arr, int64_t arrsize); - void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize); +template +void avx512_qselect(T *arr, int64_t k, int64_t arrsize); +void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize); + +template +inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) { + avx512_qselect(arr, k - 1, arrsize); + avx512_qsort(arr, k - 1); +} +inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize) +{ + avx512_qselect_fp16(arr, k - 1, arrsize); + avx512_qsort_fp16(arr, k - 1); +} + template bool comparison_func(const T &a, const T &b) { @@ -117,8 +131,8 @@ static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask) return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max } /* - * Parition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. + * Parition one ZMM register based on the pivot and returns the + * number of elements that are greater than or equal to the pivot. */ template static inline int32_t partition_vec(type_t *arr, @@ -129,20 +143,20 @@ static inline int32_t partition_vec(type_t *arr, zmm_t *smallest_vec, zmm_t *biggest_vec) { - /* which elements are larger than the pivot */ - typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec); - int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask); + /* which elements are larger than or equal to the pivot */ + typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); + int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask); vtype::mask_compressstoreu( - arr + left, vtype::knot_opmask(gt_mask), curr_vec); + arr + left, vtype::knot_opmask(ge_mask), curr_vec); vtype::mask_compressstoreu( - arr + right - amount_gt_pivot, gt_mask, curr_vec); + arr + right - amount_ge_pivot, ge_mask, curr_vec); *smallest_vec = vtype::min(curr_vec, *smallest_vec); *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; + return amount_ge_pivot; } /* * Parition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. + * first element that is greater than or equal to the pivot. */ template static inline int64_t partition_avx512(type_t *arr, @@ -174,7 +188,7 @@ static inline int64_t partition_avx512(type_t *arr, if (right - left == vtype::numlanes) { zmm_t vec = vtype::loadu(arr + left); - int32_t amount_gt_pivot = partition_vec(arr, + int32_t amount_ge_pivot = partition_vec(arr, left, left + vtype::numlanes, vec, @@ -183,7 +197,7 @@ static inline int64_t partition_avx512(type_t *arr, &max_vec); *smallest = vtype::reducemin(min_vec); *biggest = vtype::reducemax(max_vec); - return left + (vtype::numlanes - amount_gt_pivot); + return left + (vtype::numlanes - amount_ge_pivot); } // first and last vtype::numlanes values are partitioned at the end @@ -211,7 +225,7 @@ static inline int64_t partition_avx512(type_t *arr, left += vtype::numlanes; } // partition the current vector and save it on both sides of the array - int32_t amount_gt_pivot + int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, @@ -220,27 +234,27 @@ static inline int64_t partition_avx512(type_t *arr, &min_vec, &max_vec); ; - r_store -= amount_gt_pivot; - l_store += (vtype::numlanes - amount_gt_pivot); + r_store -= amount_ge_pivot; + l_store += (vtype::numlanes - amount_ge_pivot); } /* partition and save vec_left and vec_right */ - int32_t amount_gt_pivot = partition_vec(arr, + int32_t amount_ge_pivot = partition_vec(arr, l_store, r_store + vtype::numlanes, vec_left, pivot_vec, &min_vec, &max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); - amount_gt_pivot = partition_vec(arr, + l_store += (vtype::numlanes - amount_ge_pivot); + amount_ge_pivot = partition_vec(arr, l_store, l_store + vtype::numlanes, vec_right, pivot_vec, &min_vec, &max_vec); - l_store += (vtype::numlanes - amount_gt_pivot); + l_store += (vtype::numlanes - amount_ge_pivot); *smallest = vtype::reducemin(min_vec); *biggest = vtype::reducemax(max_vec); return l_store; diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 363d2b55..8a9a49ed 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -144,6 +144,17 @@ replace_inf_with_nan(_Float16 *arr, int64_t arrsize, int64_t nan_count) memset(arr + arrsize - nan_count, 0xFF, nan_count * 2); } +template <> +void avx512_qselect(_Float16 *arr, int64_t k, int64_t arrsize) +{ + if (arrsize > 1) { + int64_t nan_count = replace_nan_with_inf(arr, arrsize); + qselect_16bit_, _Float16>( + arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + replace_inf_with_nan(arr, arrsize, nan_count); + } +} + template <> void avx512_qsort(_Float16 *arr, int64_t arrsize) { diff --git a/tests/meson.build b/tests/meson.build index 0a82d96f..a69222ac 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -11,7 +11,7 @@ endif if cpp.has_argument('-march=icelake-client') libtests += static_library('tests_qsort', - files('test_qsort.cpp', ), + files('test_sort.cpp', ), dependencies: gtest_dep, include_directories : [src, utils], cpp_args : ['-O3', '-march=icelake-client'], diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h new file mode 100644 index 00000000..a41b2c63 --- /dev/null +++ b/tests/test-qsort-common.h @@ -0,0 +1,11 @@ +#ifndef AVX512_TEST_COMMON +#define AVX512_TEST_COMMON + +#include "avx512-16bit-qsort.hpp" +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" +#include "cpuinfo.h" +#include "rand_array.h" +#include + +#endif diff --git a/tests/test_partial_qsort.hpp b/tests/test_partial_qsort.hpp new file mode 100644 index 00000000..5c08064e --- /dev/null +++ b/tests/test_partial_qsort.hpp @@ -0,0 +1,49 @@ +#include "test-qsort-common.h" + +template +class avx512_partial_sort : public ::testing::Test { +}; +TYPED_TEST_SUITE_P(avx512_partial_sort); + +TYPED_TEST_P(avx512_partial_sort, test_ranges) +{ + int64_t arrsize = 1024; + int64_t nranges = 500; + + if (cpu_has_avx512bw()) { + if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) { + GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; + } + std::vector arr; + std::vector sortedarr; + std::vector psortedarr; + /* Random array */ + arr = get_uniform_rand_array(arrsize); + sortedarr = arr; + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + + for (size_t ii = 0; ii < nranges; ++ii) { + psortedarr = arr; + + /* Pick a random number of elements to sort at the beginning of the array */ + int k = get_uniform_rand_array(1, arrsize, 1).front(); + + /* Sort the range and verify all the required elements match the presorted set */ + avx512_partial_qsort(psortedarr.data(), k, psortedarr.size()); + for (size_t jj = 0; jj < k; jj++) { + ASSERT_EQ(sortedarr[jj], psortedarr[jj]); + } + + psortedarr.clear(); + } + + arr.clear(); + sortedarr.clear(); + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512bw"; + } +} + +REGISTER_TYPED_TEST_SUITE_P(avx512_partial_sort, test_ranges); diff --git a/tests/test_qselect.hpp b/tests/test_qselect.hpp new file mode 100644 index 00000000..cad017bb --- /dev/null +++ b/tests/test_qselect.hpp @@ -0,0 +1,51 @@ +#include "test-qsort-common.h" + +template +class avx512_select : public ::testing::Test { +}; +TYPED_TEST_SUITE_P(avx512_select); + +TYPED_TEST_P(avx512_select, test_arrsizes) +{ + if (cpu_has_avx512bw()) { + if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) { + GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; + } + std::vector arrsizes; + for (int64_t ii = 0; ii < 1024; ++ii) { + arrsizes.push_back(ii); + } + std::vector arr; + std::vector sortedarr; + std::vector psortedarr; + for (size_t ii = 0; ii < arrsizes.size(); ++ii) { + /* Random array */ + arr = get_uniform_rand_array(arrsizes[ii]); + sortedarr = arr; + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + for (size_t k = 0; k < arr.size(); ++k) { + psortedarr = arr; + avx512_qselect(psortedarr.data(), k, psortedarr.size()); + /* index k is correct */ + ASSERT_EQ(sortedarr[k], psortedarr[k]); + /* Check left partition */ + for (size_t jj = 0; jj < k; jj++) { + ASSERT_LE(psortedarr[jj], psortedarr[k]); + } + /* Check right partition */ + for (size_t jj = k+1; jj < arr.size(); jj++) { + ASSERT_GE(psortedarr[jj], psortedarr[k]); + } + psortedarr.clear(); + } + arr.clear(); + sortedarr.clear(); + } + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512bw"; + } +} + +REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_arrsizes); diff --git a/tests/test_qsort.cpp b/tests/test_qsort.hpp similarity index 70% rename from tests/test_qsort.cpp rename to tests/test_qsort.hpp index 6d82a35b..65a8eaf6 100644 --- a/tests/test_qsort.cpp +++ b/tests/test_qsort.hpp @@ -3,13 +3,7 @@ * * SPDX-License-Identifier: BSD-3-Clause * *******************************************/ -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" -#include "cpuinfo.h" -#include "rand_array.h" -#include -#include +#include "test-qsort-common.h" template class avx512_sort : public ::testing::Test { @@ -46,13 +40,3 @@ TYPED_TEST_P(avx512_sort, test_arrsizes) } REGISTER_TYPED_TEST_SUITE_P(avx512_sort, test_arrsizes); - -using Types = testing::Types; -INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_sort, Types); diff --git a/tests/test_qsortfp16.cpp b/tests/test_qsortfp16.cpp index ab5c10fe..f86d77df 100644 --- a/tests/test_qsortfp16.cpp +++ b/tests/test_qsortfp16.cpp @@ -72,3 +72,88 @@ TEST(avx512_qsort_float16, test_special_floats) GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; } } + +TEST(avx512_qselect_float16, test_arrsizes) +{ + if (cpu_has_avx512fp16()) { + std::vector arrsizes; + for (int64_t ii = 0; ii < 1024; ++ii) { + arrsizes.push_back(ii); + } + std::vector<_Float16> arr; + std::vector<_Float16> sortedarr; + std::vector<_Float16> psortedarr; + + for (size_t ii = 0; ii < arrsizes.size(); ++ii) { + /* Random array */ + for (size_t jj = 0; jj < arrsizes[ii]; ++jj) { + _Float16 temp = (float)rand() / (float)(RAND_MAX); + arr.push_back(temp); + sortedarr.push_back(temp); + } + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + for (size_t k = 0; k < arr.size(); ++k) { + psortedarr = arr; + avx512_qselect<_Float16>(psortedarr.data(), k, psortedarr.size()); + /* index k is correct */ + ASSERT_EQ(sortedarr[k], psortedarr[k]); + /* Check left partition */ + for (size_t jj = 0; jj < k; jj++) { + ASSERT_LE(psortedarr[jj], psortedarr[k]); + } + /* Check right partition */ + for (size_t jj = k+1; jj < arr.size(); jj++) { + ASSERT_GE(psortedarr[jj], psortedarr[k]); + } + psortedarr.clear(); + } + arr.clear(); + sortedarr.clear(); + } + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; + } +} + +TEST(avx512_partial_qsort_float16, test_ranges) +{ + if (cpu_has_avx512fp16()) { + int64_t arrsize = 1024; + int64_t nranges = 500; + + std::vector<_Float16> arr; + std::vector<_Float16> sortedarr; + std::vector<_Float16> psortedarr; + + /* Random array */ + for (size_t ii = 0; ii < arrsize; ++ii) { + _Float16 temp = (float)rand() / (float)(RAND_MAX); + arr.push_back(temp); + sortedarr.push_back(temp); + } + /* Sort with std::sort for comparison */ + std::sort(sortedarr.begin(), sortedarr.end()); + + for (size_t ii = 0; ii < nranges; ++ii) { + psortedarr = arr; + + int k = get_uniform_rand_array(1, arrsize, 1).front(); + + /* Sort the range and verify all the required elements match the presorted set */ + avx512_partial_qsort<_Float16>(psortedarr.data(), k, psortedarr.size()); + for (size_t jj = 0; jj < k; jj++) { + ASSERT_EQ(sortedarr[jj], psortedarr[jj]); + } + + psortedarr.clear(); + } + + arr.clear(); + sortedarr.clear(); + } + else { + GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; + } +} diff --git a/tests/test_sort.cpp b/tests/test_sort.cpp new file mode 100644 index 00000000..85a6bd8d --- /dev/null +++ b/tests/test_sort.cpp @@ -0,0 +1,15 @@ +#include "test_qsort.hpp" +#include "test_qselect.hpp" +#include "test_partial_qsort.hpp" + +using QuickSortTestTypes = testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_sort, QuickSortTestTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_select, QuickSortTestTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_partial_sort, QuickSortTestTypes);