diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml index cda490f9..98a01db7 100644 --- a/.github/workflows/build-numpy.yml +++ b/.github/workflows/build-numpy.yml @@ -11,7 +11,7 @@ on: jobs: NumPyMultiarrayTests: - runs-on: ubuntu-latest + runs-on: intel-ubuntu-latest steps: - name: Checkout x86-simd-sort diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 67708ef1..762a24d2 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -7,9 +7,9 @@ on: branches: [ "main" ] jobs: - ICX: + SKL: - runs-on: ubuntu-latest + runs-on: intel-ubuntu-latest steps: - uses: actions/checkout@v3 @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | sudo apt update - sudo apt -y install g++-10 libgtest-dev meson curl git cmake + sudo apt -y install g++-13 libgtest-dev meson curl git cmake - name: Install google benchmarks run: | @@ -29,25 +29,25 @@ jobs: - name: Install Intel SDE run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/732268/sde-external-9.7.0-2022-05-09-lin.tar.xz + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde - name: Build env: - CXX: g++-10 + CXX: g++-13 run: | make clean - meson setup --warnlevel 2 --werror --buildtype plain builddir + meson setup --warnlevel 2 --werror --buildtype release builddir cd builddir ninja - - name: Run test suite on ICX - run: sde -icx -- ./builddir/testexe + - name: Run test suite on SKL + run: sde -skl -- ./builddir/testexe - SPR: + SKX: - runs-on: ubuntu-latest + runs-on: intel-ubuntu-latest steps: - uses: actions/checkout@v3 @@ -55,7 +55,7 @@ jobs: - name: Install dependencies run: | sudo apt update - sudo apt -y install g++-12 libgtest-dev meson curl git cmake + sudo apt -y install g++-13 libgtest-dev meson curl git cmake - name: Install google benchmarks run: | @@ -67,58 +67,93 @@ jobs: - name: Install Intel SDE run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/732268/sde-external-9.7.0-2022-05-09-lin.tar.xz + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde - name: Build env: - CXX: g++-12 + CXX: g++-13 run: | make clean - meson setup --warnlevel 2 --werror --buildtype plain builddir + meson setup --warnlevel 2 --werror --buildtype release builddir cd builddir ninja - - name: Run _Float16 test suite on SPR - run: sde -spr -- ./builddir/testexe --gtest_filter="*float16*" + - name: Run test suite on SKX + run: sde -skx -- ./builddir/testexe - compare-benchmarks-with-main: - if: ${{ false }} # disable for now + TGL: - runs-on: ubuntu-latest + runs-on: intel-ubuntu-latest steps: - uses: actions/checkout@v3 - with: - fetch-depth: 0 - path: x86-simd-sort - - name: Specify branch name - working-directory: ${{ github.workspace }}/x86-simd-sort - run: git switch -c pr-branch + - name: Install dependencies + run: | + sudo apt update + sudo apt -y install g++-13 libgtest-dev meson curl git cmake + + - name: Install google benchmarks + run: | + git clone https://github.com/google/benchmark.git + cd benchmark + cmake -E make_directory "build" + cmake -E chdir "build" cmake -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ../ + sudo cmake --build 
"build" --config Release --target install + + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + + - name: Build + env: + CXX: g++-13 + run: | + make clean + meson setup --warnlevel 2 --werror --buildtype release builddir + cd builddir + ninja + - name: Run test suite on TGL + run: sde -tgl -- ./builddir/testexe + + SPR: - - uses: actions/setup-python@v4 - with: - python-version: '3.9' + runs-on: intel-ubuntu-latest + + steps: + - uses: actions/checkout@v3 - name: Install dependencies run: | sudo apt update - sudo apt -y install g++-12 libgtest-dev meson curl git cmake + sudo apt -y install g++-13 libgtest-dev meson curl git cmake - name: Install google benchmarks run: | git clone https://github.com/google/benchmark.git cd benchmark - pip3 install -r tools/requirements.txt cmake -E make_directory "build" cmake -E chdir "build" cmake -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ../ sudo cmake --build "build" --config Release --target install - - name: Run bench-compare - working-directory: ${{ github.workspace }}/x86-simd-sort + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + + - name: Build env: - CXX: g++-12 - GBENCH: ${{ github.workspace }}/benchmark - run: bash -x scripts/branch-compare.sh avx + CXX: g++-13 + run: | + make clean + meson setup --warnlevel 2 --werror --buildtype release builddir + cd builddir + ninja + + - name: Run test suite on SPR + run: sde -spr -- ./builddir/testexe diff --git a/Makefile b/Makefile index 27302673..f25c8dad 100644 --- a/Makefile +++ b/Makefile @@ -75,9 +75,14 @@ benchexe: $(BENCHOBJS) $(UTILOBJS) .PHONY: meson meson: - meson setup --warnlevel 2 --werror --buildtype plain builddir + meson setup --warnlevel 2 --werror --buildtype release builddir cd builddir && ninja +.PHONY: mesondebug +mesondebug: + meson setup --warnlevel 2 --werror --buildtype debug debug + cd debug && ninja + .PHONY: clean clean: $(RM) -rf $(TESTOBJS) $(BENCHOBJS) $(UTILOBJS) testexe benchexe builddir diff --git a/_clang-format b/_clang-format index 30f08064..98760584 100644 --- a/_clang-format +++ b/_clang-format @@ -63,7 +63,7 @@ KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None +NamespaceIndentation: Inner PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 diff --git a/benchmarks/bench-qsort-common.h b/benchmarks/bench-all.cpp similarity index 58% rename from benchmarks/bench-qsort-common.h rename to benchmarks/bench-all.cpp index 60792618..23fc17a0 100644 --- a/benchmarks/bench-qsort-common.h +++ b/benchmarks/bench-all.cpp @@ -1,12 +1,5 @@ -#ifndef AVX512_BENCH_COMMON -#define AVX512_BENCH_COMMON - -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-argsort.hpp" -#include "avx512-64bit-qsort.hpp" - #include "rand_array.h" +#include "x86simdsort.h" #include #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) 
\ @@ -18,11 +11,15 @@ func(st, __VA_ARGS__); \ }))) -#define BENCH(func, type) \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_128, 128, std::string("random")); \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_256, 256, std::string("random")); \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_512, 512, std::string("random")); \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_1k, 1024, std::string("random")); \ +#define BENCH_SORT(func, type) \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_128, 128, std::string("random")); \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_256, 256, std::string("random")); \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_512, 512, std::string("random")); \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_1k, 1024, std::string("random")); \ MY_BENCHMARK_CAPTURE(func, type, random_5k, 5000, std::string("random")); \ MY_BENCHMARK_CAPTURE( \ func, type, random_100k, 100000, std::string("random")); \ @@ -37,4 +34,13 @@ MY_BENCHMARK_CAPTURE( \ func, type, reverse_10k, 10000, std::string("reverse")); -#endif +#define BENCH_PARTIAL(func, type) \ + MY_BENCHMARK_CAPTURE(func, type, k10, 10000, 10); \ + MY_BENCHMARK_CAPTURE(func, type, k100, 10000, 100); \ + MY_BENCHMARK_CAPTURE(func, type, k1000, 10000, 1000); \ + MY_BENCHMARK_CAPTURE(func, type, k5000, 10000, 5000); + +#include "bench-argsort.hpp" +#include "bench-partial-qsort.hpp" +#include "bench-qselect.hpp" +#include "bench-qsort.hpp" diff --git a/benchmarks/bench-argsort.hpp b/benchmarks/bench-argsort.hpp index 905fb581..0546d7c4 100644 --- a/benchmarks/bench-argsort.hpp +++ b/benchmarks/bench-argsort.hpp @@ -1,13 +1,11 @@ -#include "bench-qsort-common.h" - template -std::vector stdargsort(const std::vector &array) +std::vector stdargsort(const std::vector &array) { - std::vector indices(array.size()); + std::vector indices(array.size()); std::iota(indices.begin(), indices.end(), 0); std::sort(indices.begin(), indices.end(), - [&array](int64_t left, int64_t right) -> bool { + [&array](size_t left, size_t right) -> bool { // sort indices according to corresponding array element return array[left] < array[right]; }); @@ -16,77 +14,40 @@ std::vector stdargsort(const std::vector &array) } template -static void stdargsort(benchmark::State &state, Args &&...args) +static void scalarargsort(benchmark::State &state, Args &&...args) { + // get args auto args_tuple = std::make_tuple(std::move(args)...); - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector inx; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector inx; + // benchmark for (auto _ : state) { inx = stdargsort(arr); } } template -static void avx512argsort(benchmark::State &state, Args &&...args) +static void simdargsort(benchmark::State &state, Args &&...args) { + // get args auto args_tuple = std::make_tuple(std::move(args)...); - if 
(!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector inx; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector inx; + // benchmark for (auto _ : state) { - inx = avx512_argsort(arr.data(), ARRSIZE); + inx = x86simdsort::argsort(arr.data(), arrsize); } } #define BENCH_BOTH(type) \ - BENCH(avx512argsort, type) \ - BENCH(stdargsort, type) + BENCH_SORT(simdargsort, type) \ + BENCH_SORT(scalarargsort, type) BENCH_BOTH(int64_t) BENCH_BOTH(uint64_t) diff --git a/benchmarks/bench-partial-qsort.hpp b/benchmarks/bench-partial-qsort.hpp index c5091392..77663d39 100644 --- a/benchmarks/bench-partial-qsort.hpp +++ b/benchmarks/bench-partial-qsort.hpp @@ -1,17 +1,10 @@ -#include "bench-qsort-common.h" - -template -static void avx512_partial_qsort(benchmark::State &state) +template +static void simdpartialsort(benchmark::State &state, Args &&...args) { - if (!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) { - state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); - } // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -19,9 +12,9 @@ static void avx512_partial_qsort(benchmark::State &state) arr = get_uniform_rand_array(ARRSIZE); arr_bkp = arr; - /* call avx512_partial_qsort */ + /* call simdpartialsort */ for (auto _ : state) { - avx512_partial_qsort(arr.data(), K, ARRSIZE); + x86simdsort::partial_qsort(arr.data(), k, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -29,12 +22,13 @@ static void avx512_partial_qsort(benchmark::State &state) } } -template -static void stdpartialsort(benchmark::State &state) +template +static void scalarpartialsort(benchmark::State &state, Args &&...args) { // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -44,7 +38,7 @@ static void stdpartialsort(benchmark::State &state) /* call std::partial_sort */ for (auto _ : state) { - std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); + std::partial_sort(arr.begin(), arr.begin() + k, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -52,51 +46,18 @@ static void stdpartialsort(benchmark::State &state) } } -// Register the function as a benchmark -BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) 
- ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -//BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +#define BENCH_BOTH_PARTIAL(type) \ + BENCH_PARTIAL(simdpartialsort, type) \ + BENCH_PARTIAL(scalarpartialsort, type) + +BENCH_BOTH_PARTIAL(uint64_t) +BENCH_BOTH_PARTIAL(int64_t) +BENCH_BOTH_PARTIAL(uint32_t) +BENCH_BOTH_PARTIAL(int32_t) +BENCH_BOTH_PARTIAL(uint16_t) +BENCH_BOTH_PARTIAL(int16_t) +BENCH_BOTH_PARTIAL(float) +BENCH_BOTH_PARTIAL(double) +#ifdef __FLT16_MAX__ +BENCH_BOTH_PARTIAL(_Float16) +#endif diff --git a/benchmarks/bench-qselect.hpp b/benchmarks/bench-qselect.hpp index af3c401a..0dab181e 100644 --- a/benchmarks/bench-qselect.hpp +++ b/benchmarks/bench-qselect.hpp @@ -1,17 +1,10 @@ -#include "bench-qsort-common.h" - -template -static void avx512_qselect(benchmark::State &state) +template +static void simdqselect(benchmark::State &state, Args &&...args) { - if (!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) { - state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); - } // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -21,7 +14,7 @@ static void avx512_qselect(benchmark::State &state) /* call avx512 quickselect */ for (auto _ : state) { - avx512_qselect(arr.data(), K, ARRSIZE); + x86simdsort::qselect(arr.data(), k, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -29,12 +22,13 @@ static void avx512_qselect(benchmark::State &state) } } -template -static void stdnthelement(benchmark::State &state) +template +static void scalarqselect(benchmark::State &state, Args &&...args) { // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -44,7 +38,7 @@ static void stdnthelement(benchmark::State &state) /* call std::nth_element */ for (auto _ : state) { - std::nth_element(arr.begin(), arr.begin() + K, arr.end()); + std::nth_element(arr.begin(), arr.begin() + k, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -52,23 +46,18 @@ static void stdnthelement(benchmark::State &state) } } -// Register the function as a benchmark 
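For reference, every benchmark in this suite relies on the same pause/restore idiom: mutate the array inside the timed region, then restore it from a pristine copy with the clock stopped. A minimal standalone sketch of that idiom (Google Benchmark only; the size, data, and benchmark name are illustrative assumptions, not taken from this suite):

#include <benchmark/benchmark.h>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <vector>

static void bm_nth_element(benchmark::State &state)
{
    std::vector<int64_t> arr(10000);
    for (auto &x : arr)
        x = std::rand();
    const std::vector<int64_t> arr_bkp = arr; // pristine copy
    const int64_t k = state.range(0);
    for (auto _ : state) {
        std::nth_element(arr.begin(), arr.begin() + k, arr.end());
        state.PauseTiming(); // keep the restore out of the measurement
        arr = arr_bkp;
        state.ResumeTiming();
    }
}
BENCHMARK(bm_nth_element)->Arg(10)->Arg(1000);
BENCHMARK_MAIN();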
-BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -//BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +#define BENCH_BOTH_QSELECT(type) \ + BENCH_PARTIAL(simdqselect, type) \ + BENCH_PARTIAL(scalarqselect, type) + +BENCH_BOTH_QSELECT(uint64_t) +BENCH_BOTH_QSELECT(int64_t) +BENCH_BOTH_QSELECT(uint32_t) +BENCH_BOTH_QSELECT(int32_t) +BENCH_BOTH_QSELECT(uint16_t) +BENCH_BOTH_QSELECT(int16_t) +BENCH_BOTH_QSELECT(float) +BENCH_BOTH_QSELECT(double) +#ifdef __FLT16_MAX__ +BENCH_BOTH_QSELECT(_Float16) +#endif diff --git a/benchmarks/bench-qsort.cpp b/benchmarks/bench-qsort.cpp deleted file mode 100644 index 97e78ffc..00000000 --- a/benchmarks/bench-qsort.cpp +++ /dev/null @@ -1,4 +0,0 @@ -#include "bench-qsort.hpp" -#include "bench-argsort.hpp" -#include "bench-partial-qsort.hpp" -#include "bench-qselect.hpp" diff --git a/benchmarks/bench-qsort.hpp b/benchmarks/bench-qsort.hpp index 3b03b1da..f95b05ba 100644 --- a/benchmarks/bench-qsort.hpp +++ b/benchmarks/bench-qsort.hpp @@ -1,34 +1,14 @@ -#include "bench-qsort-common.h" - template -static void stdsort(benchmark::State &state, Args &&...args) +static void scalarsort(benchmark::State &state, Args &&...args) { + // Get args auto args_tuple = std::make_tuple(std::move(args)...); - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector arr_bkp; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - arr_bkp = arr; - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector arr_bkp = arr; + // benchmark for (auto _ : state) { std::sort(arr.begin(), arr.end()); state.PauseTiming(); @@ -38,42 +18,18 @@ static void stdsort(benchmark::State &state, Args &&...args) } template -static void avx512qsort(benchmark::State &state, Args &&...args) +static void simdsort(benchmark::State &state, Args &&...args) { + // Get args auto args_tuple = std::make_tuple(std::move(args)...); - if 
(!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) { - state.SkipWithMessage("Requires AVX512 VBMI2"); - } - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector arr_bkp; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - arr_bkp = arr; - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector arr_bkp = arr; + // benchmark for (auto _ : state) { - avx512_qsort(arr.data(), ARRSIZE); + x86simdsort::qsort(arr.data(), arrsize); state.PauseTiming(); arr = arr_bkp; state.ResumeTiming(); @@ -81,8 +37,8 @@ static void avx512qsort(benchmark::State &state, Args &&...args) } #define BENCH_BOTH_QSORT(type) \ - BENCH(avx512qsort, type) \ - BENCH(stdsort, type) + BENCH_SORT(simdsort, type) \ + BENCH_SORT(scalarsort, type) BENCH_BOTH_QSORT(uint64_t) BENCH_BOTH_QSORT(int64_t) @@ -92,3 +48,6 @@ BENCH_BOTH_QSORT(uint16_t) BENCH_BOTH_QSORT(int16_t) BENCH_BOTH_QSORT(float) BENCH_BOTH_QSORT(double) +#ifdef __FLT16_MAX__ +BENCH_BOTH_QSORT(_Float16) +#endif diff --git a/benchmarks/bench-qsortfp16.cpp b/benchmarks/bench-qsortfp16.cpp deleted file mode 100644 index 769c2c2f..00000000 --- a/benchmarks/bench-qsortfp16.cpp +++ /dev/null @@ -1,201 +0,0 @@ -#include "avx512fp16-16bit-qsort.hpp" - -#include "rand_array.h" -#include - -template -static void avx512_qsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call avx512 quicksort */ - for (auto _ : state) { - avx512_qsort(arr.data(), ARRSIZE); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -template -static void stdsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call std::sort */ - for (auto _ : state) { - std::sort(arr.begin(), arr.end()); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_qsort<_Float16>)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort<_Float16>)->Arg(10000)->Arg(1000000); - -template -static void avx512_qselect(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - 
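// (working copy vs. backup: arr is sorted inside the timed loop, arr_bkp restores it between iterations)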
std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call avx512 quickselect */ - for (auto _ : state) { - avx512_qselect(arr.data(), K, ARRSIZE); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -template -static void stdnthelement(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call std::nth_element */ - for (auto _ : state) { - std::nth_element(arr.begin(), arr.begin() + K, arr.end()); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_qselect<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -template -static void avx512_partial_qsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call avx512_partial_qsort */ - for (auto _ : state) { - avx512_partial_qsort(arr.data(), K, ARRSIZE); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -template -static void stdpartialsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call std::partial_sort */ - for (auto _ : state) { - std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_partial_qsort<_Float16>) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); diff --git a/benchmarks/bench-tgl.out b/benchmarks/bench-tgl.out deleted file mode 100644 index 1bb03936..00000000 --- a/benchmarks/bench-tgl.out +++ /dev/null @@ -1,28 +0,0 @@ -|-----------------+-------------+------------+-----------------+-----------+----------| -| Array data type | typeid name | array size | avx512_qsort | std::sort | speed up | -|-----------------+-------------+------------+-----------------+-----------+----------| -| uniform random | uint32_t | 10000 | 115697 | 1579118 | 13.6 | -| uniform random | uint32_t | 100000 | 1786812 | 19973203 | 11.2 | -| uniform random | uint32_t | 1000000 | 22536966 | 233470422 | 10.4 | -| uniform random | int32_t | 10000 | 95591 | 
1569108 | 16.4 | -| uniform random | int32_t | 100000 | 1790362 | 19785007 | 11.1 | -| uniform random | int32_t | 1000000 | 22874571 | 233358497 | 10.2 | -| uniform random | float | 10000 | 113316 | 1668407 | 14.7 | -| uniform random | float | 100000 | 1920018 | 21815024 | 11.4 | -| uniform random | float | 1000000 | 24776954 | 256867990 | 10.4 | -| uniform random | uint64_t | 10000 | 233501 | 1537649 | 6.6 | -| uniform random | uint64_t | 100000 | 3991372 | 19559859 | 4.9 | -| uniform random | uint64_t | 1000000 | 49818870 | 232687666 | 4.7 | -| uniform random | int64_t | 10000 | 228000 | 1445131 | 6.3 | -| uniform random | int64_t | 100000 | 3892092 | 18917322 | 4.9 | -| uniform random | int64_t | 1000000 | 48957088 | 235100259 | 4.8 | -| uniform random | double | 10000 | 180307 | 1702801 | 9.4 | -| uniform random | double | 100000 | 3596886 | 21849587 | 6.1 | -| uniform random | double | 1000000 | 47724381 | 258014177 | 5.4 | -| uniform random | uint16_t | 10000 | 84732 | 1548275 | 18.3 | -| uniform random | uint16_t | 100000 | 1406417 | 19632858 | 14.0 | -| uniform random | uint16_t | 1000000 | 17119960 | 214085305 | 12.5 | -| uniform random | int16_t | 10000 | 84703 | 1547726 | 18.3 | -| uniform random | int16_t | 100000 | 1442726 | 19705242 | 13.7 | -| uniform random | int16_t | 1000000 | 20210224 | 212137465 | 10.5 | -|-----------------+-------------+------------+-----------------+-----------+----------| \ No newline at end of file diff --git a/benchmarks/meson.build b/benchmarks/meson.build index d7b62b07..fe126f15 100644 --- a/benchmarks/meson.build +++ b/benchmarks/meson.build @@ -1,19 +1,10 @@ libbench = [] -if cpp.has_argument('-march=icelake-client') - libbench += static_library('bench_qsort', - files('bench-qsort.cpp', ), - dependencies: gbench_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=icelake-client'], - ) -endif - -if cancompilefp16 - libbench += static_library('bench_qsortfp16', - files('bench-qsortfp16.cpp', ), - dependencies: gbench_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=sapphirerapids'], - ) -endif +libbench += static_library('bench_qsort', + files( + 'bench-all.cpp', + ), + dependencies: gbench_dep, + include_directories : [src, lib, utils], + cpp_args : ['-O3'], + ) diff --git a/lib/meson.build b/lib/meson.build new file mode 100644 index 00000000..fc544701 --- /dev/null +++ b/lib/meson.build @@ -0,0 +1,31 @@ +libtargets = [] + +if cpp.has_argument('-march=skylake-avx512') + libtargets += static_library('libskx', + files( + 'x86simdsort-skx.cpp', + ), + include_directories : [src], + cpp_args : ['-march=skylake-avx512', flags_hide_symbols], + ) +endif + +if cpp.has_argument('-march=icelake-client') + libtargets += static_library('libicl', + files( + 'x86simdsort-icl.cpp', + ), + include_directories : [src], + cpp_args : ['-march=icelake-client', flags_hide_symbols], + ) +endif + +if cancompilefp16 + libtargets += static_library('libspr', + files( + 'x86simdsort-spr.cpp', + ), + include_directories : [src], + cpp_args : ['-march=sapphirerapids', flags_hide_symbols], + ) +endif diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp new file mode 100644 index 00000000..2aa3a575 --- /dev/null +++ b/lib/x86simdsort-icl.cpp @@ -0,0 +1,38 @@ +// ICL specific routines: +#include "avx512-16bit-qsort.hpp" +#include "x86simdsort-internal.h" + +namespace xss { +namespace avx512 { + template <> + void qsort(uint16_t *arr, size_t size) + { + avx512_qsort(arr, size); + } + template <> + void qselect(uint16_t 
*arr, size_t k, size_t arrsize, bool hasnan) + { + avx512_qselect(arr, k, arrsize, hasnan); + } + template <> + void partial_qsort(uint16_t *arr, size_t k, size_t arrsize, bool hasnan) + { + avx512_partial_qsort(arr, k, arrsize, hasnan); + } + template <> + void qsort(int16_t *arr, size_t size) + { + avx512_qsort(arr, size); + } + template <> + void qselect(int16_t *arr, size_t k, size_t arrsize, bool hasnan) + { + avx512_qselect(arr, k, arrsize, hasnan); + } + template <> + void partial_qsort(int16_t *arr, size_t k, size_t arrsize, bool hasnan) + { + avx512_partial_qsort(arr, k, arrsize, hasnan); + } +} // namespace avx512 +} // namespace xss diff --git a/lib/x86simdsort-internal.h b/lib/x86simdsort-internal.h new file mode 100644 index 00000000..7e716e8d --- /dev/null +++ b/lib/x86simdsort-internal.h @@ -0,0 +1,69 @@ +#ifndef XSS_INTERNAL_METHODS +#define XSS_INTERNAL_METHODS +#include "x86simdsort.h" +#include +#include + +namespace xss { +namespace avx512 { + // quicksort + template + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); + // quickselect + template + XSS_HIDE_SYMBOL void + qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); + // partial sort + template + XSS_HIDE_SYMBOL void + partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); + // argsort + template + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); + // argselect + template + XSS_HIDE_SYMBOL std::vector + argselect(T *arr, size_t k, size_t arrsize); +} // namespace avx512 +namespace avx2 { + // quicksort + template + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); + // quickselect + template + XSS_HIDE_SYMBOL void + qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); + // partial sort + template + XSS_HIDE_SYMBOL void + partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); + // argsort + template + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); + // argselect + template + XSS_HIDE_SYMBOL std::vector + argselect(T *arr, size_t k, size_t arrsize); +} // namespace avx2 +namespace scalar { + // quicksort + template + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); + // quickselect + template + XSS_HIDE_SYMBOL void + qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); + // partial sort + template + XSS_HIDE_SYMBOL void + partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); + // argsort + template + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); + // argselect + template + XSS_HIDE_SYMBOL std::vector + argselect(T *arr, size_t k, size_t arrsize); +} // namespace scalar +} // namespace xss +#endif diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h new file mode 100644 index 00000000..6e8d67bf --- /dev/null +++ b/lib/x86simdsort-scalar.h @@ -0,0 +1,55 @@ +#include "custom-compare.h" +#include +#include + +namespace xss { +namespace scalar { + template + void qsort(T *arr, size_t arrsize) + { + std::sort(arr, arr + arrsize, compare>()); + } + template + void qselect(T *arr, size_t k, size_t arrsize, bool hasnan) + { + if (hasnan) { + std::nth_element( + arr, arr + k, arr + arrsize, compare>()); + } + else { + std::nth_element(arr, arr + k, arr + arrsize); + } + } + template + void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan) + { + if (hasnan) { + std::partial_sort( + arr, arr + k, arr + arrsize, compare>()); + } + else { + std::partial_sort(arr, arr + k, arr + arrsize); + } + } + template + std::vector argsort(T *arr, size_t arrsize) + { + std::vector arg(arrsize); 
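// indirect sort: arg starts as 0 .. arrsize-1 and is reordered by comparing the arr elements it indexes (compare_arg carries the arr pointer)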
+ std::iota(arg.begin(), arg.end(), 0); + std::sort(arg.begin(), arg.end(), compare_arg>(arr)); + return arg; + } + template + std::vector argselect(T *arr, size_t k, size_t arrsize) + { + std::vector arg(arrsize); + std::iota(arg.begin(), arg.end(), 0); + std::nth_element(arg.begin(), + arg.begin() + k, + arg.end(), + compare_arg>(arr)); + return arg; + } + +} // namespace scalar +} // namespace xss diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp new file mode 100644 index 00000000..4ebb9c11 --- /dev/null +++ b/lib/x86simdsort-skx.cpp @@ -0,0 +1,43 @@ +// SKX specific routines: +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-argsort.hpp" +#include "avx512-64bit-qsort.hpp" +#include "x86simdsort-internal.h" + +#define DEFINE_ALL_METHODS(type) \ + template <> \ + void qsort(type *arr, size_t arrsize) \ + { \ + avx512_qsort(arr, arrsize); \ + } \ + template <> \ + void qselect(type *arr, size_t k, size_t arrsize, bool hasnan) \ + { \ + avx512_qselect(arr, k, arrsize, hasnan); \ + } \ + template <> \ + void partial_qsort(type *arr, size_t k, size_t arrsize, bool hasnan) \ + { \ + avx512_partial_qsort(arr, k, arrsize, hasnan); \ + } \ + template <> \ + std::vector argsort(type *arr, size_t arrsize) \ + { \ + return avx512_argsort(arr, arrsize); \ + } \ + template <> \ + std::vector argselect(type *arr, size_t k, size_t arrsize) \ + { \ + return avx512_argselect(arr, k, arrsize); \ + } + +namespace xss { +namespace avx512 { + DEFINE_ALL_METHODS(uint32_t) + DEFINE_ALL_METHODS(int32_t) + DEFINE_ALL_METHODS(float) + DEFINE_ALL_METHODS(uint64_t) + DEFINE_ALL_METHODS(int64_t) + DEFINE_ALL_METHODS(double) +} // namespace avx512 +} // namespace xss diff --git a/lib/x86simdsort-spr.cpp b/lib/x86simdsort-spr.cpp new file mode 100644 index 00000000..4672bcb8 --- /dev/null +++ b/lib/x86simdsort-spr.cpp @@ -0,0 +1,23 @@ +// SPR specific routines: +#include "avx512fp16-16bit-qsort.hpp" +#include "x86simdsort-internal.h" + +namespace xss { +namespace avx512 { + template <> + void qsort(_Float16 *arr, size_t size) + { + avx512_qsort(arr, size); + } + template <> + void qselect(_Float16 *arr, size_t k, size_t arrsize, bool hasnan) + { + avx512_qselect(arr, k, arrsize, hasnan); + } + template <> + void partial_qsort(_Float16 *arr, size_t k, size_t arrsize, bool hasnan) + { + avx512_partial_qsort(arr, k, arrsize, hasnan); + } +} // namespace avx512 +} // namespace xss diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp new file mode 100644 index 00000000..091d717a --- /dev/null +++ b/lib/x86simdsort.cpp @@ -0,0 +1,147 @@ +#include "x86simdsort.h" +#include "x86simdsort-internal.h" +#include "x86simdsort-scalar.h" +#include +#include +#include + +static int check_cpu_feature_support(std::string_view cpufeature) +{ + if (cpufeature == "avx512_spr") +#ifdef __FLT16_MAX__ + return __builtin_cpu_supports("avx512f") + && __builtin_cpu_supports("avx512fp16") + && __builtin_cpu_supports("avx512vbmi2"); +#else + return 0; +#endif + else if (cpufeature == "avx512_icl") + return __builtin_cpu_supports("avx512f") + && __builtin_cpu_supports("avx512vbmi2") + && __builtin_cpu_supports("avx512bw") + && __builtin_cpu_supports("avx512vl"); + else if (cpufeature == "avx512_skx") + return __builtin_cpu_supports("avx512f") + && __builtin_cpu_supports("avx512dq") + && __builtin_cpu_supports("avx512vl"); + else if (cpufeature == "avx2") + return __builtin_cpu_supports("avx2"); + + return 0; +} + +std::string_view static find_preferred_cpu( + std::initializer_list cpulist) +{ + for (auto cpu : cpulist) { + 
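// the first entry in the caller's preference list that this CPU supports wins; otherwise fall back to "scalar"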
if (check_cpu_feature_support(cpu)) return cpu; + } + return "scalar"; +} + +constexpr bool +dispatch_requested(std::string_view cpurequested, + std::initializer_list cpulist) +{ + for (auto cpu : cpulist) { + if (cpu.find(cpurequested) != std::string_view::npos) return true; + } + return false; +} + +#define CAT_(a, b) a##b +#define CAT(a, b) CAT_(a, b) + +#define DECLARE_INTERNAL_qsort(TYPE) \ + static void (*internal_qsort##TYPE)(TYPE *, size_t) = NULL; \ + template <> \ + void qsort(TYPE *arr, size_t arrsize) \ + { \ + (*internal_qsort##TYPE)(arr, arrsize); \ + } + +#define DECLARE_INTERNAL_qselect(TYPE) \ + static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool) \ + = NULL; \ + template <> \ + void qselect(TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ + { \ + (*internal_qselect##TYPE)(arr, k, arrsize, hasnan); \ + } + +#define DECLARE_INTERNAL_partial_qsort(TYPE) \ + static void (*internal_partial_qsort##TYPE)(TYPE *, size_t, size_t, bool) \ + = NULL; \ + template <> \ + void partial_qsort(TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ + { \ + (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan); \ + } + +#define DECLARE_INTERNAL_argsort(TYPE) \ + static std::vector (*internal_argsort##TYPE)(TYPE *, size_t) \ + = NULL; \ + template <> \ + std::vector argsort(TYPE *arr, size_t arrsize) \ + { \ + return (*internal_argsort##TYPE)(arr, arrsize); \ + } + +#define DECLARE_INTERNAL_argselect(TYPE) \ + static std::vector (*internal_argselect##TYPE)( \ + TYPE *, size_t, size_t) \ + = NULL; \ + template <> \ + std::vector argselect(TYPE *arr, size_t k, size_t arrsize) \ + { \ + return (*internal_argselect##TYPE)(arr, k, arrsize); \ + } + +/* runtime dispatch mechanism */ +#define DISPATCH(func, TYPE, ...) \ + DECLARE_INTERNAL_##func(TYPE) static __attribute__((constructor)) void \ + CAT(CAT(resolve_, func), TYPE)(void) \ + { \ + CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ + __builtin_cpu_init(); \ + std::string_view preferred_cpu = find_preferred_cpu({__VA_ARGS__}); \ + if constexpr (dispatch_requested("avx512", {__VA_ARGS__})) { \ + if (preferred_cpu.find("avx512") != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) = &xss::avx512::func; \ + return; \ + } \ + } \ + else if constexpr (dispatch_requested("avx2", {__VA_ARGS__})) { \ + if (preferred_cpu.find("avx2") != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) = &xss::avx2::func; \ + return; \ + } \ + } \ + } + +namespace x86simdsort { +#ifdef __FLT16_MAX__ +DISPATCH(qsort, _Float16, "avx512_spr") +DISPATCH(qselect, _Float16, "avx512_spr") +DISPATCH(partial_qsort, _Float16, "avx512_spr") +DISPATCH(argsort, _Float16, "none") +DISPATCH(argselect, _Float16, "none") +#endif + +#define DISPATCH_ALL(func, ISA_16BIT, ISA_32BIT, ISA_64BIT) \ + DISPATCH(func, uint16_t, ISA_16BIT) \ + DISPATCH(func, int16_t, ISA_16BIT) \ + DISPATCH(func, float, ISA_32BIT) \ + DISPATCH(func, int32_t, ISA_32BIT) \ + DISPATCH(func, uint32_t, ISA_32BIT) \ + DISPATCH(func, int64_t, ISA_64BIT) \ + DISPATCH(func, uint64_t, ISA_64BIT) \ + DISPATCH(func, double, ISA_64BIT) + +DISPATCH_ALL(qsort, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) +DISPATCH_ALL(qselect, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) +DISPATCH_ALL(partial_qsort, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) +DISPATCH_ALL(argsort, "none", "avx512_skx", "avx512_skx") +DISPATCH_ALL(argselect, "none", "avx512_skx", "avx512_skx") + +} // namespace x86simdsort diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h new file mode 100644 index 
00000000..e3f54b39 --- /dev/null +++ b/lib/x86simdsort.h @@ -0,0 +1,29 @@ +#ifndef X86_SIMD_SORT +#define X86_SIMD_SORT +#include +#include + +#define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) +#define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) + +namespace x86simdsort { +// quicksort +template +XSS_EXPORT_SYMBOL void qsort(T *arr, size_t arrsize); +// quickselect +template +XSS_EXPORT_SYMBOL void +qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); +// partial sort +template +XSS_EXPORT_SYMBOL void +partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); +// argsort +template +XSS_EXPORT_SYMBOL std::vector argsort(T *arr, size_t arrsize); +// argselect +template +XSS_EXPORT_SYMBOL std::vector +argselect(T *arr, size_t k, size_t arrsize); +} // namespace x86simdsort +#endif diff --git a/meson.build b/meson.build index a10598f9..c90e6eae 100644 --- a/meson.build +++ b/meson.build @@ -4,11 +4,12 @@ project('x86-simd-sort', 'cpp', default_options : ['cpp_std=c++17']) cpp = meson.get_compiler('cpp') src = include_directories('src') +lib = include_directories('lib') bench = include_directories('benchmarks') utils = include_directories('utils') tests = include_directories('tests') -gtest_dep = dependency('gtest_main', required : true, static: true) -gbench_dep = dependency('benchmark', required : true, static: true) +gtest_dep = dependency('gtest_main', required : false, static: false) +gbench_dep = dependency('benchmark', required : false, static: false) fp16code = '''#include int main() { @@ -18,25 +19,41 @@ int main() { } ''' cancompilefp16 = cpp.compiles(fp16code, args:'-march=sapphirerapids') +flags_hide_symbols = ['-fvisibility=hidden', '-fvisibility-inlines-hidden'] -subdir('tests') -subdir('benchmarks') +subdir('lib') +libsimdsort = shared_library('x86simdsort', + 'lib/x86simdsort.cpp', + include_directories : [utils, lib], + link_with : [libtargets], + cpp_args : [flags_hide_symbols], + ) -testexe = executable('testexe', - include_directories : [src, utils], +if gtest_dep.found() + subdir('tests') + testexe = executable('testexe', + include_directories : [lib, utils], dependencies : gtest_dep, - link_whole : [libtests] + link_whole : [libtests], + link_with : libsimdsort, ) +endif -benchexe = executable('benchexe', - include_directories : [src, utils, bench], +if gbench_dep.found() + subdir('benchmarks') + benchexe = executable('benchexe', + include_directories : [src, lib, utils, bench], dependencies : [gbench_dep], link_args: ['-lbenchmark_main'], link_whole : [libbench], + link_with : libsimdsort, ) +endif summary({ 'Can compile AVX-512 FP16 ISA': cancompilefp16, + 'Built test content': gtest_dep.found(), + 'Built benchmarks': gbench_dep.found(), }, section: 'Configuration', bool_yn: true diff --git a/run-bench.py b/run-bench.py index c93cfae9..cf86d0da 100644 --- a/run-bench.py +++ b/run-bench.py @@ -19,17 +19,17 @@ baseline = "" contender = "" if "qsort" in args.benchcompare: - baseline = "stdsort.*" + filterb - contender = "avx512qsort.*" + filterb - elif "qselect" in args.benchcompare: - baseline = "stdnthelement.*" + filterb - contender = "avx512_qselect.*" + filterb + baseline = "scalarsort.*" + filterb + contender = "simdsort.*" + filterb + elif "select" in args.benchcompare: + baseline = "scalarqselect.*" + filterb + contender = "simdqselect.*" + filterb elif "partial" in args.benchcompare: - baseline = "stdpartialsort.*" + filterb - contender = "avx512_partial_qsort.*" + filterb + baseline = "scalarpartialsort.*" + filterb 
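# regex pairs track the scalar*/simd* benchmark names now generated via benchmarks/bench-all.cpp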
+ contender = "simdpartialsort.*" + filterb elif "argsort" in args.benchcompare: - baseline = "stdargsort.*" + filterb - contender = "avx512argsort.*" + filterb + baseline = "scalarargsort.*" + filterb + contender = "simdargsort.*" + filterb else: parser.print_help(sys.stderr) parser.error("ERROR: Unknown argument '%s'" % args.benchcompare) diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh index 498a6b66..57347cce 100755 --- a/scripts/bench-compare.sh +++ b/scripts/bench-compare.sh @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then fi compare=$(realpath .bench/google-benchmark/tools/compare.py) -meson setup --warnlevel 0 --buildtype plain builddir-${branch} +meson setup --warnlevel 0 --buildtype release builddir-${branch} cd builddir-${branch} ninja $compare filters ./benchexe $1 $2 diff --git a/scripts/branch-compare.sh b/scripts/branch-compare.sh index ff8b3474..6b6b6610 100755 --- a/scripts/branch-compare.sh +++ b/scripts/branch-compare.sh @@ -26,7 +26,7 @@ build_branch() { fi fi cd $dir_name - meson setup --warnlevel 0 --buildtype plain builddir + meson setup --warnlevel 0 --buildtype release builddir cd builddir ninja cd ../../ diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 13b732d0..edd118b3 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -423,10 +423,10 @@ bool comparison_func>(const uint16_t &a, const uint16_t &b) } template <> -int64_t replace_nan_with_inf>(uint16_t *arr, - int64_t arrsize) +arrsize_t replace_nan_with_inf>(uint16_t *arr, + arrsize_t arrsize) { - int64_t nan_count = 0; + arrsize_t nan_count = 0; __mmask16 loadmask = 0xFFFF; while (arrsize > 0) { if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } @@ -445,29 +445,46 @@ int64_t replace_nan_with_inf>(uint16_t *arr, template <> bool is_a_nan(uint16_t elem) { - return (elem & 0x7c00) == 0x7c00; + return ((elem & 0x7c00u) == 0x7c00u) && + ((elem & 0x03ffu) != 0); } -void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE +void avx512_qsort_fp16(uint16_t *arr, arrsize_t arrsize) { if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf, uint16_t>( - arr, arrsize); + arrsize_t nan_count + = replace_nan_with_inf, uint16_t>(arr, + arrsize); qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } } -void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize, bool hasnan) +X86_SIMD_SORT_INLINE +void avx512_qselect_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = true) { - int64_t indx_last_elem = arrsize - 1; + arrsize_t indx_last_elem = arrsize - 1; if (UNLIKELY(hasnan)) { indx_last_elem = move_nans_to_end_of_array(arr, arrsize); } if (indx_last_elem >= k) { qselect_, uint16_t>( - arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); + arr, k, 0, indx_last_elem, 2 * (arrsize_t)log2(indx_last_elem)); } } + +X86_SIMD_SORT_INLINE +void avx512_partial_qsort_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false) +{ + avx512_qselect_fp16(arr, k - 1, arrsize, hasnan); + avx512_qsort_fp16(arr, k - 1); +} #endif // AVX512_QSORT_16BIT diff --git a/src/avx512-64bit-argsort.hpp b/src/avx512-64bit-argsort.hpp index c9c5e961..4571a469 100644 --- a/src/avx512-64bit-argsort.hpp +++ b/src/avx512-64bit-argsort.hpp @@ -12,13 +12,13 @@ #include "avx512-common-argsort.h" template -void std_argselect_withnan( - T *arr, int64_t *arg, int64_t k, int64_t 
left, int64_t right) +X86_SIMD_SORT_INLINE void std_argselect_withnan( + T *arr, arrsize_t *arg, arrsize_t k, arrsize_t left, arrsize_t right) { std::nth_element(arg + left, arg + k, arg + right, - [arr](int64_t a, int64_t b) -> bool { + [arr](arrsize_t a, arrsize_t b) -> bool { if ((!std::isnan(arr[a])) && (!std::isnan(arr[b]))) { return arr[a] < arr[b]; } @@ -33,11 +33,12 @@ void std_argselect_withnan( /* argsort using std::sort */ template -void std_argsort_withnan(T *arr, int64_t *arg, int64_t left, int64_t right) +X86_SIMD_SORT_INLINE void +std_argsort_withnan(T *arr, arrsize_t *arg, arrsize_t left, arrsize_t right) { std::sort(arg + left, arg + right, - [arr](int64_t left, int64_t right) -> bool { + [arr](arrsize_t left, arrsize_t right) -> bool { if ((!std::isnan(arr[left])) && (!std::isnan(arr[right]))) { return arr[left] < arr[right]; } @@ -52,18 +53,20 @@ void std_argsort_withnan(T *arr, int64_t *arg, int64_t left, int64_t right) /* argsort using std::sort */ template -void std_argsort(T *arr, int64_t *arg, int64_t left, int64_t right) +X86_SIMD_SORT_INLINE void +std_argsort(T *arr, arrsize_t *arg, arrsize_t left, arrsize_t right) { std::sort(arg + left, arg + right, - [arr](int64_t left, int64_t right) -> bool { + [arr](arrsize_t left, arrsize_t right) -> bool { // sort indices according to corresponding array element return arr[left] < arr[right]; }); } template -X86_SIMD_SORT_INLINE void argsort_8_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_8_64bit(type_t *arr, arrsize_t *arg, int32_t N) { using reg_t = typename vtype::reg_t; typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; @@ -75,7 +78,8 @@ X86_SIMD_SORT_INLINE void argsort_8_64bit(type_t *arr, int64_t *arg, int32_t N) } template -X86_SIMD_SORT_INLINE void argsort_16_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_16_64bit(type_t *arr, arrsize_t *arg, int32_t N) { if (N <= 8) { argsort_8_64bit(arr, arg, N); @@ -97,7 +101,8 @@ X86_SIMD_SORT_INLINE void argsort_16_64bit(type_t *arr, int64_t *arg, int32_t N) } template -X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_32_64bit(type_t *arr, arrsize_t *arg, int32_t N) { if (N <= 16) { argsort_16_64bit(arr, arg, N); @@ -108,7 +113,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N) reg_t arrzmm[4]; argreg_t argzmm[4]; -X86_SIMD_SORT_UNROLL_LOOP(2) + X86_SIMD_SORT_UNROLL_LOOP(2) for (int ii = 0; ii < 2; ++ii) { argzmm[ii] = argtype::loadu(arg + 8 * ii); arrzmm[ii] = vtype::i64gather(arr, arg + 8 * ii); @@ -117,7 +122,7 @@ X86_SIMD_SORT_UNROLL_LOOP(2) uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull; opmask_t load_mask[2] = {0xFF, 0xFF}; -X86_SIMD_SORT_UNROLL_LOOP(2) + X86_SIMD_SORT_UNROLL_LOOP(2) for (int ii = 0; ii < 2; ++ii) { load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF; argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii); @@ -140,7 +145,8 @@ X86_SIMD_SORT_UNROLL_LOOP(2) } template -X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_64_64bit(type_t *arr, arrsize_t *arg, int32_t N) { if (N <= 32) { argsort_32_64bit(arr, arg, N); @@ -151,7 +157,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N) reg_t arrzmm[8]; argreg_t argzmm[8]; -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { argzmm[ii] = argtype::loadu(arg + 8 * ii); arrzmm[ii] 
= vtype::i64gather(arr, arg + 8 * ii); @@ -160,7 +166,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF}; uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF; argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii); @@ -170,7 +176,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) argzmm[ii + 4]); } -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 8; ii = ii + 2) { bitonic_merge_two_zmm_64bit( arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]); @@ -179,11 +185,11 @@ X86_SIMD_SORT_UNROLL_LOOP(4) bitonic_merge_four_zmm_64bit(arrzmm + 4, argzmm + 4); bitonic_merge_eight_zmm_64bit(arrzmm, argzmm); -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { argtype::storeu(arg + 8 * ii, argzmm[ii]); } -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]); } @@ -192,7 +198,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) /* arsort 128 doesn't seem to make much of a difference to perf*/ //template //X86_SIMD_SORT_INLINE void -//argsort_128_64bit(type_t *arr, int64_t *arg, int32_t N) +//argsort_128_64bit(type_t *arr, arrsize_t *arg, int32_t N) //{ // if (N <= 64) { // argsort_64_64bit(arr, arg, N); @@ -212,7 +218,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) // // opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; // if (N != 128) { -// uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; +// uarrsize_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; //X86_SIMD_SORT_UNROLL_LOOP(8) // for (int ii = 0; ii < 8; ++ii) { // load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF; @@ -248,14 +254,14 @@ X86_SIMD_SORT_UNROLL_LOOP(4) //} template -type_t get_pivot_64bit(type_t *arr, - int64_t *arg, - const int64_t left, - const int64_t right) +X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, + arrsize_t *arg, + const arrsize_t left, + const arrsize_t right) { if (right - left >= vtype::numlanes) { // median of 8 - int64_t size = (right - left) / 8; + arrsize_t size = (right - left) / 8; using reg_t = typename vtype::reg_t; reg_t rand_vec = vtype::set(arr[arg[left + size]], arr[arg[left + 2 * size]], @@ -275,11 +281,11 @@ type_t get_pivot_64bit(type_t *arr, } template -inline void argsort_64bit_(type_t *arr, - int64_t *arg, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, + arrsize_t *arg, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -298,7 +304,7 @@ inline void argsort_64bit_(type_t *arr, type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512_unrolled( + arrsize_t pivot_index = partition_avx512_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if (pivot != smallest) argsort_64bit_(arr, arg, left, pivot_index - 1, max_iters - 1); @@ -307,12 +313,12 @@ inline void argsort_64bit_(type_t *arr, } template -static void argselect_64bit_(type_t *arr, - int64_t *arg, - int64_t pos, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void argselect_64bit_(type_t *arr, + arrsize_t *arg, + arrsize_t pos, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * 
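* (callers seed max_iters with 2 * log2(arrsize); it drops by one per recursion level.)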
Resort to std::sort if quicksort isnt making any progress @@ -331,7 +337,7 @@ static void argselect_64bit_(type_t *arr, type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512_unrolled( + arrsize_t pivot_index = partition_avx512_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if ((pivot != smallest) && (pos < pivot_index)) argselect_64bit_( @@ -343,7 +349,8 @@ static void argselect_64bit_(type_t *arr, /* argsort methods for 32-bit and 64-bit dtypes */ template -void avx512_argsort(T *arr, int64_t *arg, int64_t arrsize) +X86_SIMD_SORT_INLINE void +avx512_argsort(T *arr, arrsize_t *arg, arrsize_t arrsize) { using vectype = typename std::conditional, @@ -356,14 +363,15 @@ void avx512_argsort(T *arr, int64_t *arg, int64_t arrsize) } } argsort_64bit_( - arr, arg, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, arg, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } template -std::vector avx512_argsort(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE std::vector avx512_argsort(T *arr, + arrsize_t arrsize) { - std::vector indices(arrsize); + std::vector indices(arrsize); std::iota(indices.begin(), indices.end(), 0); avx512_argsort(arr, indices.data(), arrsize); return indices; @@ -371,7 +379,8 @@ std::vector avx512_argsort(T *arr, int64_t arrsize) /* argselect methods for 32-bit and 64-bit dtypes */ template -void avx512_argselect(T *arr, int64_t *arg, int64_t k, int64_t arrsize) +X86_SIMD_SORT_INLINE void +avx512_argselect(T *arr, arrsize_t *arg, arrsize_t k, arrsize_t arrsize) { using vectype = typename std::conditional, @@ -385,17 +394,34 @@ void avx512_argselect(T *arr, int64_t *arg, int64_t k, int64_t arrsize) } } argselect_64bit_( - arr, arg, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, arg, k, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } template -std::vector avx512_argselect(T *arr, int64_t k, int64_t arrsize) +X86_SIMD_SORT_INLINE std::vector +avx512_argselect(T *arr, arrsize_t k, arrsize_t arrsize) { - std::vector indices(arrsize); + std::vector indices(arrsize); std::iota(indices.begin(), indices.end(), 0); avx512_argselect(arr, indices.data(), k, arrsize); return indices; } +/* To maintain compatibility with NumPy build */ +template +X86_SIMD_SORT_INLINE void +avx512_argselect(T *arr, int64_t *arg, arrsize_t k, arrsize_t arrsize) +{ + avx512_argselect(arr, reinterpret_cast(arg), k, arrsize); +} + +template +X86_SIMD_SORT_INLINE void +avx512_argsort(T *arr, int64_t *arg, arrsize_t arrsize) +{ + avx512_argsort(arr, reinterpret_cast(arg), arrsize); +} + + #endif // AVX512_ARGSORT_64BIT diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 3227e071..f9018231 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -96,7 +96,7 @@ struct ymm_vector { { return _mm512_mask_i64gather_ps(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -248,7 +248,7 @@ struct ymm_vector { { return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -394,7 +394,7 @@ struct ymm_vector { { return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, 
@@ -543,7 +543,7 @@ struct zmm_vector { { return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]],
@@ -673,7 +673,7 @@ struct zmm_vector { { return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]],
@@ -708,6 +708,10 @@ struct zmm_vector { { return _mm512_mask_compressstoreu_epi64(mem, mask, x); } + static reg_t maskz_loadu(opmask_t mask, void const *mem) + { + return _mm512_maskz_loadu_epi64(mask, mem); + } static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_epi64(x, mask, mem);
@@ -835,7 +839,7 @@ struct zmm_vector { { return _mm512_mask_i64gather_pd(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]],
diff --git a/src/avx512-64bit-keyvalue-networks.hpp b/src/avx512-64bit-keyvalue-networks.hpp index e9577b79..12527f95 100644 --- a/src/avx512-64bit-keyvalue-networks.hpp +++ b/src/avx512-64bit-keyvalue-networks.hpp
@@ -1,4 +1,5 @@ - +#ifndef AVX512_KEYVALUE_NETWORKS +#define AVX512_KEYVALUE_NETWORKS template -void heapify(type1_t *keys, type2_t *indexes, int64_t idx, int64_t size) +X86_SIMD_SORT_INLINE void +heapify(type1_t *keys, type2_t *indexes, arrsize_t idx, arrsize_t size) { - int64_t i = idx; + arrsize_t i = idx; while (true) { - int64_t j = 2 * i + 1; + arrsize_t j = 2 * i + 1; if (j >= size || j < 0) { break; } int k = j + 1; if (k < size && keys[j] < keys[k]) { j = k; }
@@ -383,12 +384,13 @@ template -void heap_sort(type1_t *keys, type2_t *indexes, int64_t size) +X86_SIMD_SORT_INLINE void +heap_sort(type1_t *keys, type2_t *indexes, arrsize_t size) { - for (int64_t i = size / 2 - 1; i >= 0; i--) { - heapify(keys, indexes, i, size); + /* arrsize_t is unsigned: `i >= 0` would loop forever, so count down from size / 2 */ + for (arrsize_t i = size / 2; i > 0; i--) { + heapify(keys, indexes, i - 1, size); } - for (int64_t i = size - 1; i > 0; i--) { + for (arrsize_t i = size - 1; i > 0; i--) { std::swap(keys[0], keys[i]); std::swap(indexes[0], indexes[i]); heapify(keys, indexes, 0, i);
@@ -399,11 +401,11 @@ template -void qsort_64bit_(type1_t *keys, - type2_t *indexes, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void qsort_64bit_(type1_t *keys, + type2_t *indexes, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * Resort to std::sort if quicksort isn't making any progress
@@ -427,7 +429,7 @@ void qsort_64bit_(type1_t *keys, type1_t pivot = get_pivot(keys, left, right); type1_t smallest = vtype1::type_max(); type1_t biggest = vtype1::type_min(); - int64_t pivot_index = partition_avx512( + arrsize_t pivot_index = partition_avx512( keys, indexes, left, right + 1, pivot, &smallest, &biggest); if (pivot != smallest) { qsort_64bit_(
@@ -440,19 +442,28 @@ void qsort_64bit_(type1_t *keys, } template -void avx512_qsort_kv(T1 *keys, T2 *indexes, int64_t arrsize) +X86_SIMD_SORT_INLINE void +avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize) { if (arrsize > 1) { if constexpr (std::is_floating_point_v) { - int64_t nan_count + arrsize_t nan_count = replace_nan_with_inf>(keys, arrsize); qsort_64bit_, zmm_vector>( - keys, indexes, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(keys, arrsize, nan_count); } else { qsort_64bit_, zmm_vector>( - keys, indexes, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); } } }
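The heap_sort amendment above is worth spelling out: once indices become the unsigned arrsize_t, any countdown loop written as `for (i = n - 1; i >= 0; --i)` spins forever because the condition can never fail. A common overflow-safe idiom keeps the counter one past the index it names, as in this self-contained sketch (heap_sort_sketch and sift_down are illustrative names; the library's real version also carries the SIMD type parameters):

#include <cstddef>
#include <utility>

// Key/value heapsort with an unsigned size type. The build loop counts i
// from size/2 down to 1 and heapifies root i - 1, so it visits roots
// size/2 - 1 .. 0 without ever evaluating a negative index.
template <typename K, typename V>
void heap_sort_sketch(K *keys, V *vals, size_t size)
{
    if (size < 2) return; // also guards the size - 1 below against wrap-around
    auto sift_down = [&](size_t root, size_t n) {
        size_t i = root;
        while (true) {
            size_t j = 2 * i + 1;                        // left child
            if (j >= n) break;
            if (j + 1 < n && keys[j] < keys[j + 1]) ++j; // pick larger child
            if (!(keys[i] < keys[j])) break;
            std::swap(keys[i], keys[j]);
            std::swap(vals[i], vals[j]);
            i = j;
        }
    };
    for (size_t i = size / 2; i > 0; --i)
        sift_down(i - 1, size);
    for (size_t i = size - 1; i > 0; --i) {
        std::swap(keys[0], keys[i]);
        std::swap(vals[0], vals[i]);
        sift_down(0, i);
    }
}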
diff --git a/src/avx512-common-argsort.h b/src/avx512-common-argsort.h index 375afc0b..357d143c 100644 --- a/src/avx512-common-argsort.h +++ b/src/avx512-common-argsort.h
@@ -12,7 +12,7 @@ #include #include -using argtype = zmm_vector; +using argtype = zmm_vector; using argreg_t = typename argtype::reg_t; /*
@@ -20,14 +20,14 @@ using argtype = zmm_vector; * last element that is less than or equal to the pivot. */ template -static inline int32_t partition_vec(type_t *arg, - int64_t left, - int64_t right, - const argreg_t arg_vec, - const reg_t curr_vec, - const reg_t pivot_vec, - reg_t *smallest_vec, - reg_t *biggest_vec) +X86_SIMD_SORT_INLINE int32_t partition_vec(type_t *arg, + arrsize_t left, + arrsize_t right, + const argreg_t arg_vec, + const reg_t curr_vec, + const reg_t pivot_vec, + reg_t *smallest_vec, + reg_t *biggest_vec) { /* which elements are larger than or equal to the pivot */ typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec);
@@ -45,13 +45,13 @@ static inline int32_t partition_vec(type_t *arg, * last element that is less than or equal to the pivot. */ template -static inline int64_t partition_avx512(type_t *arr, - int64_t *arg, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, + arrsize_t *arg, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { /* make array length divisible by vtype::numlanes, shortening the array */ for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
@@ -95,8 +95,8 @@ static inline int64_t partition_avx512(type_t *arr, argreg_t argvec_right = argtype::loadu(arg + (right - vtype::numlanes)); reg_t vec_right = vtype::i64gather(arr, arg + (right - vtype::numlanes)); // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += vtype::numlanes; right -= vtype::numlanes;
@@ -160,13 +160,13 @@ static inline int64_t partition_avx512(type_t *arr, template -static inline int64_t partition_avx512_unrolled(type_t *arr, - int64_t *arg, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, + arrsize_t *arg, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { if (right - left <= 8 * num_unroll * vtype::numlanes) { return partition_avx512(
@@ -196,7 +196,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, // first and last vtype::numlanes values are partitioned at the end reg_t vec_left[num_unroll], vec_right[num_unroll]; argreg_t argvec_left[num_unroll], argvec_right[num_unroll]; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii); vec_left[ii] = vtype::i64gather(arr, arg + left + vtype::numlanes * ii);
@@ -206,8 +206,8 @@ X86_SIMD_SORT_UNROLL_LOOP(8) arr, arg + (right - vtype::numlanes * (num_unroll - ii))); } // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - 
vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += num_unroll * vtype::numlanes; right -= num_unroll * vtype::numlanes; @@ -221,7 +221,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) */ if ((r_store + vtype::numlanes) - right < left - l_store) { right -= num_unroll * vtype::numlanes; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { arg_vec[ii] = argtype::loadu(arg + right + ii * vtype::numlanes); @@ -230,7 +230,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) } } else { -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes); curr_vec[ii] = vtype::i64gather( @@ -239,7 +239,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) left += num_unroll * vtype::numlanes; } // partition the current vector and save it on both sides of the array -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_gt_pivot = partition_vec(arg, @@ -256,7 +256,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) } /* partition and save vec_left and vec_right */ -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_gt_pivot = partition_vec(arg, @@ -270,7 +270,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) l_store += (vtype::numlanes - amount_gt_pivot); r_store -= amount_gt_pivot; } -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_gt_pivot = partition_vec(arg, diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index 349f51b5..99717207 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -20,7 +20,7 @@ * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting * network. The core implementations of the vectorized qsort functions - * avx512_qsort(T*, int64_t) are modified versions of avx2 quicksort + * avx512_qsort(T*, arrsize_t) are modified versions of avx2 quicksort * presented in the paper [2] and source code associated with that paper [3]. 
* * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types @@ -67,7 +67,7 @@ #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d -#define PRAGMA(x) _Pragma (#x) +#define PRAGMA(x) _Pragma(#x) /* Compiler specific macros specific */ #ifdef _MSC_VER @@ -100,6 +100,8 @@ #define X86_SIMD_SORT_UNROLL_LOOP(num) #endif +typedef size_t arrsize_t; + template struct zmm_vector; @@ -113,49 +115,49 @@ bool is_a_nan(T elem) } template -int64_t replace_nan_with_inf(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE arrsize_t replace_nan_with_inf(T *arr, arrsize_t size) { - int64_t nan_count = 0; + arrsize_t nan_count = 0; using opmask_t = typename vtype::opmask_t; using reg_t = typename vtype::reg_t; opmask_t loadmask; reg_t in; - while (arrsize > 0) { - if (arrsize < vtype::numlanes) { - loadmask = vtype::get_partial_loadmask(arrsize); - in = vtype::maskz_loadu(loadmask, arr); + /* + * (ii + numlanes) can never overflow: max val of size is 2**63 on 64-bit + * and 2**31 on 32-bit systems + */ + for (arrsize_t ii = 0; ii < size; ii = ii + vtype::numlanes) { + if (size - ii < vtype::numlanes) { + loadmask = vtype::get_partial_loadmask(size - ii); + in = vtype::maskz_loadu(loadmask, arr + ii); } else { - in = vtype::loadu(arr); + in = vtype::loadu(arr + ii); } opmask_t nanmask = vtype::template fpclass<0x01 | 0x80>(in); nan_count += _mm_popcnt_u32((int32_t)nanmask); - vtype::mask_storeu(arr, nanmask, vtype::zmm_max()); - arr += vtype::numlanes; - arrsize -= vtype::numlanes; + vtype::mask_storeu(arr + ii, nanmask, vtype::zmm_max()); } return nan_count; } template -bool has_nan(type_t *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE bool has_nan(type_t *arr, arrsize_t size) { using opmask_t = typename vtype::opmask_t; using reg_t = typename vtype::reg_t; bool found_nan = false; opmask_t loadmask; reg_t in; - while (arrsize > 0) { - if (arrsize < vtype::numlanes) { - loadmask = vtype::get_partial_loadmask(arrsize); - in = vtype::maskz_loadu(loadmask, arr); + for (arrsize_t ii = 0; ii < size; ii = ii + vtype::numlanes) { + if (size - ii < vtype::numlanes) { + loadmask = vtype::get_partial_loadmask(size - ii); + in = vtype::maskz_loadu(loadmask, arr + ii); } else { - in = vtype::loadu(arr); + in = vtype::loadu(arr + ii); } opmask_t nanmask = vtype::template fpclass<0x01 | 0x80>(in); - arr += vtype::numlanes; - arrsize -= vtype::numlanes; if (nanmask != 0x00) { found_nan = true; break; @@ -165,9 +167,10 @@ bool has_nan(type_t *arr, int64_t arrsize) } template -void replace_inf_with_nan(type_t *arr, int64_t arrsize, int64_t nan_count) +X86_SIMD_SORT_INLINE void +replace_inf_with_nan(type_t *arr, arrsize_t size, arrsize_t nan_count) { - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + for (arrsize_t ii = size - 1; nan_count > 0; --ii) { if constexpr (std::is_floating_point_v) { arr[ii] = std::numeric_limits::quiet_NaN(); } @@ -183,12 +186,12 @@ void replace_inf_with_nan(type_t *arr, int64_t arrsize, int64_t nan_count) * in the array which is not a nan */ template -int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, arrsize_t size) { - int64_t jj = arrsize - 1; - int64_t ii = 0; - int64_t count = 0; - while (ii <= jj) { + arrsize_t jj = size - 1; + arrsize_t ii = 0; + arrsize_t count = 0; + while (ii < jj) { if (is_a_nan(arr[ii])) { std::swap(arr[ii], arr[jj]); jj -= 1; @@ -198,11 +201,15 @@ int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) ii 
+= 1; } } - return arrsize - count - 1; + /* Haven't checked for nan when ii == jj */ + if (is_a_nan(arr[ii])) { + count++; + } + return size - count - 1; } template -bool comparison_func(const T &a, const T &b) +X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b) { return a < b; } @@ -211,16 +218,17 @@ bool comparison_func(const T &a, const T &b) * COEX == Compare and Exchange two registers by swapping min and max values */ template -static void COEX(mm_t &a, mm_t &b) +X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b) { mm_t temp = a; a = vtype::min(a, b); b = vtype::max(temp, b); } + template -static inline reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) +X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) { reg_t min = vtype::min(in2, in1); reg_t max = vtype::max(in2, in1); @@ -231,13 +239,13 @@ static inline reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) * number of elements that are greater than or equal to the pivot. */ template -static inline int32_t partition_vec(type_t *arr, - int64_t left, - int64_t right, - const reg_t curr_vec, - const reg_t pivot_vec, - reg_t *smallest_vec, - reg_t *biggest_vec) +X86_SIMD_SORT_INLINE int32_t partition_vec(type_t *arr, + arrsize_t left, + arrsize_t right, + const reg_t curr_vec, + const reg_t pivot_vec, + reg_t *smallest_vec, + reg_t *biggest_vec) { /* which elements are larger than or equal to the pivot */ typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); @@ -255,12 +263,12 @@ static inline int32_t partition_vec(type_t *arr, * first element that is greater than or equal to the pivot. */ template -static inline int64_t partition_avx512(type_t *arr, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { /* make array length divisible by vtype::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { @@ -300,8 +308,8 @@ static inline int64_t partition_avx512(type_t *arr, reg_t vec_left = vtype::loadu(arr + left); reg_t vec_right = vtype::loadu(arr + (right - vtype::numlanes)); // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += vtype::numlanes; right -= vtype::numlanes; @@ -359,12 +367,12 @@ static inline int64_t partition_avx512(type_t *arr, template -static inline int64_t partition_avx512_unrolled(type_t *arr, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { if constexpr (num_unroll == 0) { return partition_avx512( @@ -399,15 +407,15 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, // We will now have atleast 16 registers worth of data to process: // left and right vtype::numlanes values are partitioned at the end reg_t vec_left[num_unroll], vec_right[num_unroll]; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii); vec_right[ii] = vtype::loadu( arr + (right - vtype::numlanes * (num_unroll - ii))); } // store points of the vectors - int64_t r_store = right - 
vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += num_unroll * vtype::numlanes; right -= num_unroll * vtype::numlanes; @@ -420,20 +428,20 @@ X86_SIMD_SORT_UNROLL_LOOP(8) */ if ((r_store + vtype::numlanes) - right < left - l_store) { right -= num_unroll * vtype::numlanes; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes); } } else { -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes); } left += num_unroll * vtype::numlanes; } -// partition the current vector and save it on both sides of the array -X86_SIMD_SORT_UNROLL_LOOP(8) + // partition the current vector and save it on both sides of the array + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, @@ -448,8 +456,8 @@ X86_SIMD_SORT_UNROLL_LOOP(8) } } -/* partition and save vec_left[8] and vec_right[8] */ -X86_SIMD_SORT_UNROLL_LOOP(8) + /* partition and save vec_left[8] and vec_right[8] */ + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, @@ -462,7 +470,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) l_store += (vtype::numlanes - amount_ge_pivot); r_store -= amount_ge_pivot; } -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, @@ -486,7 +494,8 @@ template -static void COEX(reg_t1 &key1, reg_t1 &key2, reg_t2 &index1, reg_t2 &index2) +X86_SIMD_SORT_INLINE void +COEX(reg_t1 &key1, reg_t1 &key2, reg_t2 &index1, reg_t2 &index2) { reg_t1 key_t1 = vtype1::min(key1, key2); reg_t1 key_t2 = vtype1::max(key1, key2); @@ -506,11 +515,11 @@ template -static inline reg_t1 cmp_merge(reg_t1 in1, - reg_t1 in2, - reg_t2 &indexes1, - reg_t2 indexes2, - opmask_t mask) +X86_SIMD_SORT_INLINE reg_t1 cmp_merge(reg_t1 in1, + reg_t1 in2, + reg_t2 &indexes1, + reg_t2 indexes2, + opmask_t mask) { reg_t1 tmp_keys = cmp_merge(in1, in2, mask); indexes1 = vtype2::mask_mov(indexes2, vtype1::eq(tmp_keys, in1), indexes1); @@ -527,15 +536,15 @@ template -static inline int32_t partition_vec(type_t1 *keys, - type_t2 *indexes, - int64_t left, - int64_t right, - const reg_t1 keys_vec, - const reg_t2 indexes_vec, - const reg_t1 pivot_vec, - reg_t1 *smallest_vec, - reg_t1 *biggest_vec) +X86_SIMD_SORT_INLINE int32_t partition_vec(type_t1 *keys, + type_t2 *indexes, + arrsize_t left, + arrsize_t right, + const reg_t1 keys_vec, + const reg_t2 indexes_vec, + const reg_t1 pivot_vec, + reg_t1 *smallest_vec, + reg_t1 *biggest_vec) { /* which elements are larger than the pivot */ typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec); @@ -562,13 +571,13 @@ template -static inline int64_t partition_avx512(type_t1 *keys, - type_t2 *indexes, - int64_t left, - int64_t right, - type_t1 pivot, - type_t1 *smallest, - type_t1 *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t1 *keys, + type_t2 *indexes, + arrsize_t left, + arrsize_t right, + type_t1 pivot, + type_t1 *smallest, + type_t1 *biggest) { /* make array length divisible by vtype1::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) { @@ -620,8 +629,8 @@ static inline int64_t partition_avx512(type_t1 *keys, 
indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes)); // store points of the vectors - int64_t r_store = right - vtype1::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype1::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += vtype1::numlanes; right -= vtype1::numlanes; @@ -689,13 +698,13 @@ static inline int64_t partition_avx512(type_t1 *keys, template X86_SIMD_SORT_INLINE type_t get_pivot_scalar(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { - constexpr int64_t numSamples = vtype::numlanes; + constexpr arrsize_t numSamples = vtype::numlanes; type_t samples[numSamples]; - int64_t delta = (right - left) / numSamples; + arrsize_t delta = (right - left) / numSamples; for (int i = 0; i < numSamples; i++) { samples[i] = arr[left + i * delta]; @@ -708,11 +717,11 @@ X86_SIMD_SORT_INLINE type_t get_pivot_scalar(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { // median of 32 - int64_t size = (right - left) / 32; + arrsize_t size = (right - left) / 32; type_t vec_arr[32] = {arr[left], arr[left + size], arr[left + 2 * size], @@ -752,11 +761,11 @@ X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { // median of 16 - int64_t size = (right - left) / 16; + arrsize_t size = (right - left) / 16; using reg_t = typename vtype::reg_t; type_t vec_arr[16] = {arr[left + size], arr[left + 2 * size], @@ -782,11 +791,11 @@ X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { // median of 8 - int64_t size = (right - left) / 8; + arrsize_t size = (right - left) / 8; using reg_t = typename vtype::reg_t; reg_t rand_vec = vtype::set(arr[left + size], arr[left + 2 * size], @@ -803,8 +812,8 @@ X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { if constexpr (vtype::numlanes == 8) return get_pivot_64bit(arr, left, right); @@ -816,11 +825,12 @@ X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, return get_pivot_scalar(arr, left, right); } -template -X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N); +template +void sort_n(typename vtype::type_t *arr, int N); template -static void qsort_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +X86_SIMD_SORT_INLINE void +qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -842,7 +852,7 @@ static void qsort_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index + arrsize_t pivot_index = partition_avx512_unrolled( arr, left, right + 1, pivot, &smallest, &biggest); @@ -852,11 +862,11 @@ static void qsort_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) } template -static void qselect_(type_t *arr, - int64_t pos, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void qselect_(type_t *arr, + 
arrsize_t pos, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -878,7 +888,7 @@ static void qselect_(type_t *arr, type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index + arrsize_t pivot_index = partition_avx512_unrolled( arr, left, right + 1, pivot, &smallest, &biggest); @@ -890,30 +900,29 @@ static void qselect_(type_t *arr, // Regular quicksort routines: template -void avx512_qsort(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE void avx512_qsort(T *arr, arrsize_t arrsize) { if (arrsize > 1) { /* std::is_floating_point_v<_Float16> == False, unless c++-23*/ if constexpr (std::is_floating_point_v) { - int64_t nan_count + arrsize_t nan_count = replace_nan_with_inf>(arr, arrsize); qsort_, T>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } else { qsort_, T>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } } -void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize); - template -void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false) +X86_SIMD_SORT_INLINE void +avx512_qselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan = false) { - int64_t indx_last_elem = arrsize - 1; + arrsize_t indx_last_elem = arrsize - 1; /* std::is_floating_point_v<_Float16> == False, unless c++-23*/ if constexpr (std::is_floating_point_v) { if (UNLIKELY(hasnan)) { @@ -922,29 +931,18 @@ void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false) } if (indx_last_elem >= k) { qselect_, T>( - arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); + arr, k, 0, indx_last_elem, 2 * (arrsize_t)log2(indx_last_elem)); } } -void avx512_qselect_fp16(uint16_t *arr, - int64_t k, - int64_t arrsize, - bool hasnan = false); - template -inline void -avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false) +X86_SIMD_SORT_INLINE void avx512_partial_qsort(T *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false) { avx512_qselect(arr, k - 1, arrsize, hasnan); avx512_qsort(arr, k - 1); } -inline void avx512_partial_qsort_fp16(uint16_t *arr, - int64_t k, - int64_t arrsize, - bool hasnan = false) -{ - avx512_qselect_fp16(arr, k - 1, arrsize, hasnan); - avx512_qsort_fp16(arr, k - 1); -} #endif // AVX512_QSORT_COMMON diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 9874b6fd..7d0f0a06 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -145,28 +145,43 @@ struct zmm_vector<_Float16> { template <> bool is_a_nan<_Float16>(_Float16 elem) { - Fp16Bits temp; - temp.f_ = elem; - return (temp.i_ & 0x7c00) == 0x7c00; + return elem != elem; } template <> -void replace_inf_with_nan(_Float16 *arr, int64_t arrsize, int64_t nan_count) +void replace_inf_with_nan(_Float16 *arr, arrsize_t size, arrsize_t nan_count) { - memset(arr + arrsize - nan_count, 0xFF, nan_count * 2); + Fp16Bits val; + val.i_ = 0x7c01; + for (arrsize_t ii = size - 1; nan_count > 0; --ii) { + arr[ii] = val.f_; + nan_count -= 1; + } } - /* Specialized template function for _Float16 qsort_*/ template <> -void avx512_qsort(_Float16 *arr, int64_t arrsize) +void avx512_qsort(_Float16 *arr, arrsize_t arrsize) { if (arrsize > 1) { - int64_t nan_count + arrsize_t nan_count = replace_nan_with_inf, _Float16>(arr, arrsize); qsort_, _Float16>( - arr, 0, arrsize - 1, 2 * 
(int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } } + +template <> +void avx512_qselect(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) +{ + arrsize_t indx_last_elem = arrsize - 1; + if (UNLIKELY(hasnan)) { + indx_last_elem = move_nans_to_end_of_array(arr, arrsize); + } + if (indx_last_elem >= k) { + qselect_, _Float16>( + arr, k, 0, indx_last_elem, 2 * (arrsize_t)log2(indx_last_elem)); + } +} #endif // AVX512FP16_QSORT_16BIT diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp index 09b68f70..ddfa615a 100644 --- a/src/xss-network-qsort.hpp +++ b/src/xss-network-qsort.hpp @@ -3,16 +3,14 @@ #include "avx512-common-qsort.h" -template +template X86_SIMD_SORT_INLINE void bitonic_clean_n_vec(reg_t *regs) { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int num = numVecs / 2; num >= 2; num /= 2) { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int j = 0; j < numVecs; j += num) { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < num / 2; i++) { COEX(regs[i + j], regs[i + j + num / 2]); } @@ -20,9 +18,7 @@ X86_SIMD_SORT_UNROLL_LOOP(64) } } -template +template X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs) { // Do the reverse part @@ -31,8 +27,8 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs) COEX(regs[0], regs[1]); } else if constexpr (numVecs > 2) { -// Reverse upper half -X86_SIMD_SORT_UNROLL_LOOP(64) + // Reverse upper half + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / 2; i++) { reg_t rev = vtype::reverse(regs[numVecs - i - 1]); reg_t maxV = vtype::max(regs[i], rev); @@ -45,23 +41,23 @@ X86_SIMD_SORT_UNROLL_LOOP(64) // Call cleaner bitonic_clean_n_vec(regs); -// Now do bitonic_merge -X86_SIMD_SORT_UNROLL_LOOP(64) + // Now do bitonic_merge + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs; i++) { regs[i] = vtype::bitonic_merge(regs[i]); } } template X86_SIMD_SORT_INLINE void bitonic_fullmerge_n_vec(reg_t *regs) { if constexpr (numPer > numVecs) return; else { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / numPer; i++) { bitonic_merge_n_vec(regs + i * numPer); } @@ -70,7 +66,7 @@ X86_SIMD_SORT_UNROLL_LOOP(64) } template -X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N) +X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) { if constexpr (numVecs > 1) { if (N * 2 <= numVecs * vtype::numlanes) { @@ -80,10 +76,10 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N) } reg_t vecs[numVecs]; - + // Generate masks for loading and storing typename vtype::opmask_t ioMasks[numVecs - numVecs / 2]; -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { int64_t num_to_read = std::min((int64_t)std::max(0, N - i * vtype::numlanes), @@ -91,20 +87,20 @@ X86_SIMD_SORT_UNROLL_LOOP(64) ioMasks[j] = ((0x1ull << num_to_read) - 0x1ull); } -// Unmasked part of the load -X86_SIMD_SORT_UNROLL_LOOP(64) + // Unmasked part of the load + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / 2; i++) { vecs[i] = vtype::loadu(arr + i * vtype::numlanes); } -// Masked part of the load -X86_SIMD_SORT_UNROLL_LOOP(64) + // Masked part of the load + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { vecs[i] = vtype::mask_loadu( vtype::zmm_max(), ioMasks[j], arr + i * 
vtype::numlanes); } -// Sort each loaded vector -X86_SIMD_SORT_UNROLL_LOOP(64) + // Sort each loaded vector + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs; i++) { vecs[i] = vtype::sort_vec(vecs[i]); } @@ -112,19 +108,19 @@ X86_SIMD_SORT_UNROLL_LOOP(64) // Run the full merger bitonic_fullmerge_n_vec(&vecs[0]); -// Unmasked part of the store -X86_SIMD_SORT_UNROLL_LOOP(64) + // Unmasked part of the store + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / 2; i++) { vtype::storeu(arr + i * vtype::numlanes, vecs[i]); } -// Masked part of the store -X86_SIMD_SORT_UNROLL_LOOP(64) + // Masked part of the store + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { vtype::mask_storeu(arr + i * vtype::numlanes, ioMasks[j], vecs[i]); } } -template +template X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N) { constexpr int numVecs = maxN / vtype::numlanes; @@ -136,4 +132,4 @@ X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N) sort_n_vec(arr, N); } -#endif \ No newline at end of file +#endif diff --git a/tests/meson.build b/tests/meson.build index ac0ce341..172ddf01 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -1,31 +1,16 @@ libtests = [] -if cpp.has_argument('-march=skylake-avx512') - libtests += static_library('tests_kv', - files( - 'test-keyvalue.cpp', - 'test-argsort.cpp', - ), - dependencies: gtest_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=skylake-avx512'], - ) -endif +libtests += static_library('tests_qsort', + files('test-qsort.cpp', ), + dependencies: gtest_dep, + include_directories : [lib, utils], + ) -if cpp.has_argument('-march=icelake-client') - libtests += static_library('tests_qsort', - files('test-qsort.cpp', ), - dependencies: gtest_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=icelake-client'], - ) -endif - -if cancompilefp16 - libtests += static_library('tests_qsortfp16', - files('test-qsortfp16.cpp', ), - dependencies: gtest_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=sapphirerapids'], - ) -endif +#if cancompilefp16 +# libtests += static_library('tests_qsortfp16', +# files('test-qsortfp16.cpp', ), +# dependencies: gtest_dep, +# include_directories : [src, utils], +# cpp_args : ['-O3', '-march=sapphirerapids'], +# ) +#endif diff --git a/tests/test-argselect.hpp b/tests/test-argselect.hpp deleted file mode 100644 index 13506283..00000000 --- a/tests/test-argselect.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************* - * * Copyright (C) 2023 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -template -class avx512argselect : public ::testing::Test { -}; - -TYPED_TEST_SUITE_P(avx512argselect); - -TYPED_TEST_P(avx512argselect, test_random) -{ - if (__builtin_cpu_supports("avx512bw")) { - const int arrsize = 1024; - auto arr = get_uniform_rand_array(arrsize); - std::vector sorted_inx; - if (std::is_floating_point::value) { - arr[0] = std::numeric_limits::quiet_NaN(); - arr[1] = std::numeric_limits::quiet_NaN(); - } - sorted_inx = std_argsort(arr); - std::vector kth; - for (int64_t ii = 0; ii < arrsize - 3; ++ii) { - kth.push_back(ii); - } - for (auto &k : kth) { - std::vector inx - = avx512_argselect(arr.data(), k, arr.size()); - auto true_kth = arr[sorted_inx[k]]; - EXPECT_EQ(true_kth, arr[inx[k]]) << "Failed at index k = " << k; - if (k >= 1) { - EXPECT_GE(true_kth, std_max_element(arr, inx, 0, k - 1)) - << 
"failed at k = " << k; - } - if (k != arrsize - 1) { - EXPECT_LE(true_kth, - std_min_element(arr, inx, k + 1, arrsize - 1)) - << "failed at k = " << k; - } - EXPECT_UNIQUE(inx) - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512argselect, test_random); diff --git a/tests/test-argsort-common.h b/tests/test-argsort-common.h deleted file mode 100644 index 543bfaec..00000000 --- a/tests/test-argsort-common.h +++ /dev/null @@ -1,81 +0,0 @@ -#include "avx512-64bit-argsort.hpp" - -#include "rand_array.h" -#include -#include -#include - -template -std::vector std_argsort(const std::vector &arr) -{ - std::vector indices(arr.size()); - std::iota(indices.begin(), indices.end(), 0); - std::sort(indices.begin(), - indices.end(), - [&arr](int64_t left, int64_t right) -> bool { - if ((!std::isnan(arr[left])) && (!std::isnan(arr[right]))) { - return arr[left] < arr[right]; - } - else if (std::isnan(arr[left])) { - return false; - } - else { - return true; - } - }); - - return indices; -} - -template -T std_min_element(std::vector arr, - std::vector arg, - int64_t left, - int64_t right) -{ - std::vector::iterator res = std::min_element( - arg.begin() + left, - arg.begin() + right, - [arr](int64_t a, int64_t b) -> bool { - if ((!std::isnan(arr[a])) && (!std::isnan(arr[b]))) { - return arr[a] < arr[b]; - } - else if (std::isnan(arr[a])) { - return false; - } - else { - return true; - } - }); - return arr[*res]; -} - -template -T std_max_element(std::vector arr, - std::vector arg, - int64_t left, - int64_t right) -{ - std::vector::iterator res = std::max_element( - arg.begin() + left, - arg.begin() + right, - [arr](int64_t a, int64_t b) -> bool { - if ((!std::isnan(arr[a])) && (!std::isnan(arr[b]))) { - return arr[a] > arr[b]; - } - else if (std::isnan(arr[a])) { - return true; - } - else { - return false; - } - }); - return arr[*res]; -} - -#define EXPECT_UNIQUE(sorted_arg) \ - std::sort(sorted_arg.begin(), sorted_arg.end()); \ - std::vector expected_arg(sorted_arg.size()); \ - std::iota(expected_arg.begin(), expected_arg.end(), 0); \ - EXPECT_EQ(sorted_arg, expected_arg) \ - << "Indices aren't unique. 
Array size = " << sorted_arg.size(); diff --git a/tests/test-argsort.cpp b/tests/test-argsort.cpp deleted file mode 100644 index 41ce5ca4..00000000 --- a/tests/test-argsort.cpp +++ /dev/null @@ -1,9 +0,0 @@ -#include "test-argsort-common.h" -#include "test-argsort.hpp" -#include "test-argselect.hpp" - -using ArgTestTypes - = testing::Types; - -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512argsort, ArgTestTypes); -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512argselect, ArgTestTypes); diff --git a/tests/test-argsort.hpp b/tests/test-argsort.hpp deleted file mode 100644 index 62c3de60..00000000 --- a/tests/test-argsort.hpp +++ /dev/null @@ -1,272 +0,0 @@ -/******************************************* - * * Copyright (C) 2023 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -template -class avx512argsort : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512argsort); - -TYPED_TEST_P(avx512argsort, test_random) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - /* Random array */ - arr = get_uniform_rand_array(size); - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_constant) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - /* constant array */ - auto elem = get_uniform_rand_array(1)[0]; - for (auto jj = 0; jj < size; ++jj) { - arr.push_back(elem); - } - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_small_range) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - /* array with a smaller range of values */ - arr = get_uniform_rand_array(size, 20, 1); - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size = " << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_sorted) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - std::sort(arr.begin(), arr.end()); - std::vector inx1 = std_argsort(arr); 
- std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_reverse) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_array_with_nan) -{ - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } - if (!std::is_floating_point::value) { - GTEST_SKIP() << "Skipping this test, it is meant for float/double"; - } - std::vector arrsizes; - for (int64_t ii = 2; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - arr[0] = std::numeric_limits::quiet_NaN(); - arr[1] = std::numeric_limits::quiet_NaN(); - std::vector inx - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx[jj]]); - } - if ((!std::isnan(sort1[size - 1])) || (!std::isnan(sort1[size - 2]))) { - FAIL() << "NAN's aren't sorted to the end"; - } - if (!std::is_sorted(sort1.begin(), sort1.end() - 2)) { - FAIL() << "Array isn't sorted"; - } - EXPECT_UNIQUE(inx) - arr.clear(); - } -} - -TYPED_TEST_P(avx512argsort, test_max_value_at_end_of_array) -{ - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } - std::vector arrsizes; - for (int64_t ii = 1; ii <= 256; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - if (std::numeric_limits::has_infinity) { - arr[size - 1] = std::numeric_limits::infinity(); - } - else { - arr[size - 1] = std::numeric_limits::max(); - } - std::vector inx = avx512_argsort(arr.data(), arr.size()); - std::vector sorted; - for (auto jj = 0; jj < size; ++jj) { - sorted.push_back(arr[inx[jj]]); - } - if (!std::is_sorted(sorted.begin(), sorted.end())) { - EXPECT_TRUE(false) << "Array of size " << size << "is not sorted"; - } - EXPECT_UNIQUE(inx) - arr.clear(); - } -} - -TYPED_TEST_P(avx512argsort, test_all_inf_array) -{ - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } - std::vector arrsizes; - for (int64_t ii = 1; ii <= 256; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - if (std::numeric_limits::has_infinity) { - for (int64_t jj = 1; jj <= size; ++jj) { - if (rand() % 0x1) { - arr.push_back(std::numeric_limits::infinity()); - } - } - } - else { - for (int64_t jj = 1; jj <= 
size; ++jj) { - if (rand() % 0x1) { - arr.push_back(std::numeric_limits::max()); - } - } - } - std::vector inx = avx512_argsort(arr.data(), arr.size()); - std::vector sorted; - for (auto jj = 0; jj < size; ++jj) { - sorted.push_back(arr[inx[jj]]); - } - if (!std::is_sorted(sorted.begin(), sorted.end())) { - EXPECT_TRUE(false) << "Array of size " << size << "is not sorted"; - } - EXPECT_UNIQUE(inx) - arr.clear(); - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512argsort, - test_random, - test_reverse, - test_constant, - test_sorted, - test_small_range, - test_all_inf_array, - test_array_with_nan, - test_max_value_at_end_of_array); diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp deleted file mode 100644 index 6e75f344..00000000 --- a/tests/test-keyvalue.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/******************************************* - * * Copyright (C) 2022 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -#include "avx512-64bit-keyvaluesort.hpp" - -#include "rand_array.h" -#include -#include -#define inf X86_SIMD_SORT_INFINITY - -template -struct sorted_t { - K key; - K value; -}; - -template -bool compare(sorted_t a, sorted_t b) -{ - return a.key == b.key ? a.value < b.value : a.key < b.key; -} - -template -class KeyValueSort : public ::testing::Test { -}; - -TYPED_TEST_SUITE_P(KeyValueSort); - -TYPED_TEST_P(KeyValueSort, test_64bit_random_data) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector keysizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - keysizes.push_back((TypeParam)ii); - } - std::vector keys; - std::vector values; - std::vector> sortedarr; - - for (size_t ii = 0; ii < keysizes.size(); ++ii) { - /* Random array */ - keys = get_uniform_rand_array_with_uniquevalues( - keysizes[ii]); - values = get_uniform_rand_array(keysizes[ii]); - for (size_t i = 0; i < keys.size(); i++) { - sorted_t tmp_s; - tmp_s.key = keys[i]; - tmp_s.value = values[i]; - sortedarr.emplace_back(tmp_s); - } - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), - sortedarr.end(), - compare); - avx512_qsort_kv(keys.data(), values.data(), keys.size()); - for (size_t i = 0; i < keys.size(); i++) { - ASSERT_EQ(keys[i], sortedarr[i].key); - ASSERT_EQ(values[i], sortedarr[i].value); - } - keys.clear(); - values.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TEST(KeyValueSort, test_inf_at_endofarray) -{ - std::vector key = {8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, inf}; - std::vector key_sorted - = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, inf}; - std::vector val = {7, 6, 5, 4, 3, 2, 1, 0, 8}; - std::vector val_sorted = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - avx512_qsort_kv(key.data(), val.data(), key.size()); - ASSERT_EQ(key, key_sorted); - ASSERT_EQ(val, val_sorted); -} - -REGISTER_TYPED_TEST_SUITE_P(KeyValueSort, test_64bit_random_data); - -using TypesKv = testing::Types; -INSTANTIATE_TYPED_TEST_SUITE_P(T, KeyValueSort, TypesKv); diff --git a/tests/test-partial-qsort.hpp b/tests/test-partial-qsort.hpp deleted file mode 100644 index fee3d9f3..00000000 --- a/tests/test-partial-qsort.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "test-qsort-common.h" - -template -class avx512_partial_sort : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512_partial_sort); - -TYPED_TEST_P(avx512_partial_sort, test_ranges) -{ - int64_t arrsize = 1024; - int64_t nranges = 500; - - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && 
(!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arr; - std::vector sortedarr; - std::vector psortedarr; - /* Random array */ - arr = get_uniform_rand_array(arrsize); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - - for (auto ii = 0; ii < nranges; ++ii) { - psortedarr = arr; - - /* Pick a random number of elements to sort at the beginning of the array */ - int k = get_uniform_rand_array(1, arrsize, 1).front(); - - /* Sort the range and verify all the required elements match the presorted set */ - avx512_partial_qsort( - psortedarr.data(), k, psortedarr.size()); - for (auto jj = 0; jj < k; jj++) { - ASSERT_EQ(sortedarr[jj], psortedarr[jj]); - } - - psortedarr.clear(); - } - - arr.clear(); - sortedarr.clear(); - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512_partial_sort, test_ranges); diff --git a/tests/test-qselect.hpp b/tests/test-qselect.hpp deleted file mode 100644 index b35d5486..00000000 --- a/tests/test-qselect.hpp +++ /dev/null @@ -1,112 +0,0 @@ -#include "test-qsort-common.h" - -template -class avx512_select : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512_select); - -#ifdef __FLT16_MAX__ -TEST(avx512_select, test_simple) -{ - if (__builtin_cpu_supports("avx512vbmi2")) { - std::vector<_Float16> arr{1.0, -1.0}; - avx512_qselect_fp16(reinterpret_cast(arr.data()), 0, arr.size()); - ASSERT_EQ(arr[0], -1.0); - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512vbmi2"; - } -} -#endif - -TYPED_TEST_P(avx512_select, test_random) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - std::vector sortedarr; - std::vector psortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* Random array */ - arr = get_uniform_rand_array(arrsizes[ii]); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - for (size_t k = 0; k < arr.size(); ++k) { - psortedarr = arr; - avx512_qselect( - psortedarr.data(), k, psortedarr.size()); - /* index k is correct */ - ASSERT_EQ(sortedarr[k], psortedarr[k]); - /* Check left partition */ - for (size_t jj = 0; jj < k; jj++) { - ASSERT_LE(psortedarr[jj], psortedarr[k]); - } - /* Check right partition */ - for (size_t jj = k + 1; jj < arr.size(); jj++) { - ASSERT_GE(psortedarr[jj], psortedarr[k]); - } - psortedarr.clear(); - } - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TYPED_TEST_P(avx512_select, test_small_range) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - std::vector sortedarr; - std::vector psortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* Random array */ - arr = get_uniform_rand_array(arrsizes[ii], 20, 1); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - for (size_t k = 0; k < 
arr.size(); ++k) { - psortedarr = arr; - avx512_qselect( - psortedarr.data(), k, psortedarr.size()); - /* index k is correct */ - ASSERT_EQ(sortedarr[k], psortedarr[k]); - /* Check left partition */ - for (size_t jj = 0; jj < k; jj++) { - ASSERT_LE(psortedarr[jj], psortedarr[k]); - } - /* Check right partition */ - for (size_t jj = k + 1; jj < arr.size(); jj++) { - ASSERT_GE(psortedarr[jj], psortedarr[k]); - } - psortedarr.clear(); - } - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_random, test_small_range);
diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index 9690265a..6b8241b3 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h
@@ -1,11 +1,101 @@ #ifndef AVX512_TEST_COMMON #define AVX512_TEST_COMMON -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" - +#include "custom-compare.h" #include "rand_array.h" +#include "x86simdsort.h" #include +#define EXPECT_UNIQUE(arg) \ + auto sorted_arg = arg; \ + std::sort(sorted_arg.begin(), sorted_arg.end()); \ + std::vector expected_arg(sorted_arg.size()); \ + std::iota(expected_arg.begin(), expected_arg.end(), 0); \ + EXPECT_EQ(sorted_arg, expected_arg) \ + << "Indices aren't unique. Array size = " << sorted_arg.size(); + +#define REPORT_FAIL(msg, size, type, k) \ + ASSERT_TRUE(false) << msg << ". arr size = " << size \ + << ", type = " << type << ", k = " << k; + +template +void IS_SORTED(std::vector sorted, std::vector arr, std::string type) +{ + if (memcmp(arr.data(), sorted.data(), arr.size() * sizeof(T)) != 0) { + REPORT_FAIL("Array not sorted", arr.size(), type, -1); + } +} + +template +void IS_ARG_SORTED(std::vector sortedarr, + std::vector arr, + std::vector arg, + std::string type) +{ + EXPECT_UNIQUE(arg) + std::vector arr_backup; + for (auto ii : arg) { + arr_backup.push_back(arr[ii]); + } + IS_SORTED(sortedarr, arr_backup, type); +} + +template +void IS_ARR_PARTITIONED(std::vector arr, + size_t k, + T true_kth, + std::string type) +{ + auto cmp_eq = compare>(); + auto cmp_less = compare>(); + auto cmp_leq = compare>(); + auto cmp_geq = compare>(); + + // 1) arr[k] == sorted[k]; cmp_eq handles nan comparisons + if (!cmp_eq(arr[k], true_kth)) { + REPORT_FAIL("kth element is incorrect", arr.size(), type, k); + } + // 2) Elements to the left of k should be at most arr[k] + if (k >= 1) { + T max_left + = *std::max_element(arr.begin(), arr.begin() + k, cmp_less); + if (!cmp_geq(arr[k], max_left)) { + REPORT_FAIL("incorrect left partition", arr.size(), type, k); + } + } + // 3) Elements to the right of k should be at least arr[k] + if (k != (size_t)(arr.size() - 1)) { + T min_right + = *std::min_element(arr.begin() + k + 1, arr.end(), cmp_less); + if (!cmp_leq(arr[k], min_right)) { + REPORT_FAIL("incorrect right partition", arr.size(), type, k); + } + } +} + +template +void IS_ARR_PARTIALSORTED(std::vector arr, + size_t k, + std::vector sorted, + std::string type) +{ + if (memcmp(arr.data(), sorted.data(), k * sizeof(T)) != 0) { + REPORT_FAIL("Partial array not sorted", arr.size(), type, k); + } +} + +template +void IS_ARG_PARTITIONED(std::vector arr, + std::vector arg, + T true_kth, + size_t k, + std::string type) +{ + EXPECT_UNIQUE(arg) + std::vector part_arr; + for (auto ii : arg) { + part_arr.push_back(arr[ii]); + } + IS_ARR_PARTITIONED(part_arr, k, true_kth, type); +} #endif
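The IS_ARR_PARTITIONED helper above encodes the qselect contract; a standalone version of the same invariant, checked against std::nth_element so it runs without the library, looks like this. NaN-free input is assumed here, whereas the helper above uses the custom comparators precisely to handle NaNs; is_partitioned_at_k is an illustrative name.

#include <algorithm>
#include <cstddef>
#include <vector>

// True iff arr is partitioned around position k: arr[k] is the k-th order
// statistic, everything left of k is <= arr[k], everything right is >= arr[k].
template <typename T>
bool is_partitioned_at_k(std::vector<T> arr, size_t k)
{
    std::vector<T> ref = arr;
    std::nth_element(ref.begin(), ref.begin() + k, ref.end());
    if (arr[k] != ref[k]) return false;       // wrong k-th element
    for (size_t i = 0; i < k; ++i)
        if (arr[k] < arr[i]) return false;    // left side too large
    for (size_t i = k + 1; i < arr.size(); ++i)
        if (arr[i] < arr[k]) return false;    // right side too small
    return true;
}
diff --git a/tests/test-qsort-fp.hpp 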
b/tests/test-qsort-fp.hpp deleted file mode 100644 index 8309d509..00000000 --- a/tests/test-qsort-fp.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************* - * * Copyright (C) 2022 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -#include "test-qsort-common.h" - -template -class avx512_sort_fp : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512_sort_fp); - -TYPED_TEST_P(avx512_sort_fp, test_random_nan) -{ - const int num_nans = 3; - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } - std::vector arrsizes; - for (int64_t ii = num_nans; ii < 1024; ++ii) { - arrsizes.push_back((TypeParam)ii); - } - std::vector arr; - std::vector sortedarr; - for (auto &size : arrsizes) { - /* Random array */ - arr = get_uniform_rand_array(size); - for (auto ii = 1; ii <= num_nans; ++ii) { - arr[size - ii] = std::numeric_limits::quiet_NaN(); - } - sortedarr = arr; - std::sort(sortedarr.begin(), sortedarr.end() - 3); - std::random_shuffle(arr.begin(), arr.end()); - avx512_qsort(arr.data(), arr.size()); - for (auto ii = 1; ii <= num_nans; ++ii) { - if (!std::isnan(arr[size - ii])) { - ASSERT_TRUE(false) - << "NAN's aren't sorted to the end. Arr size = " - << size; - } - } - if (!std::is_sorted(arr.begin(), arr.end() - num_nans)) { - ASSERT_TRUE(true) << "Array isn't sorted"; - } - arr.clear(); - sortedarr.clear(); - } -} -REGISTER_TYPED_TEST_SUITE_P(avx512_sort_fp, test_random_nan); diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index a35d8e8c..7ecd1a13 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -1,10 +1,169 @@ -#include "test-qsort.hpp" -#include "test-partial-qsort.hpp" -#include "test-qselect.hpp" -#include "test-qsort-fp.hpp" +/******************************************* + * * Copyright (C) 2022 Intel Corporation + * * SPDX-License-Identifier: BSD-3-Clause + * *******************************************/ + +#include "test-qsort-common.h" + +template +class simdsort : public ::testing::Test { +public: + simdsort() + { + std::iota(arrsize.begin(), arrsize.end(), 1); + arrtype = {"random", + "constant", + "sorted", + "reverse", + "smallrange", + "max_at_the_end", + "rand_max", + "rand_with_nan"}; + } + std::vector arrtype; + std::vector arrsize = std::vector(1024); +}; + +TYPED_TEST_SUITE_P(simdsort); + +TYPED_TEST_P(simdsort, test_qsort) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::sort(sortedarr.begin(), + sortedarr.end(), + compare>()); + x86simdsort::qsort(arr.data(), arr.size()); + IS_SORTED(sortedarr, arr, type); + arr.clear(); + sortedarr.clear(); + } + } +} + +TYPED_TEST_P(simdsort, test_argsort) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::sort(sortedarr.begin(), + sortedarr.end(), + compare>()); + auto arg = x86simdsort::argsort(arr.data(), arr.size()); + IS_ARG_SORTED(sortedarr, arr, arg, type); + arr.clear(); + arg.clear(); + } + } +} + +TYPED_TEST_P(simdsort, test_qselect) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + size_t k = rand() % size; + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::nth_element(sortedarr.begin(), + sortedarr.begin() + k, + sortedarr.end(), + compare>()); + x86simdsort::qselect(arr.data(), k, arr.size(), 
+
+TYPED_TEST_P(simdsort, test_argselect)
+{
+    for (auto type : this->arrtype) {
+        for (auto size : this->arrsize) {
+            size_t k = rand() % size;
+            std::vector<TypeParam> arr = get_array<TypeParam>(type, size);
+            std::vector<TypeParam> sortedarr = arr;
+            std::sort(sortedarr.begin(),
+                      sortedarr.end(),
+                      compare<TypeParam, std::less<TypeParam>>());
+            auto arg = x86simdsort::argselect(arr.data(), k, arr.size());
+            IS_ARG_PARTITIONED(arr, arg, sortedarr[k], k, type);
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+}
+
+TYPED_TEST_P(simdsort, test_partial_qsort)
+{
+    for (auto type : this->arrtype) {
+        for (auto size : this->arrsize) {
+            // k should be at least 1
+            size_t k = std::max(0x1ul, rand() % size);
+            std::vector<TypeParam> arr = get_array<TypeParam>(type, size);
+            std::vector<TypeParam> sortedarr = arr;
+            std::sort(sortedarr.begin(),
+                      sortedarr.end(),
+                      compare<TypeParam, std::less<TypeParam>>());
+            x86simdsort::partial_qsort(arr.data(), k, arr.size(), true);
+            IS_ARR_PARTIALSORTED(arr, k, sortedarr, type);
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+}
+
+TYPED_TEST_P(simdsort, test_comparator)
+{
+    if constexpr (xss::fp::is_floating_point_v<TypeParam>) {
+        auto less = compare<TypeParam, std::less<TypeParam>>();
+        auto leq = compare<TypeParam, std::less_equal<TypeParam>>();
+        auto greater = compare<TypeParam, std::greater<TypeParam>>();
+        auto geq = compare<TypeParam, std::greater_equal<TypeParam>>();
+        auto equal = compare<TypeParam, std::equal_to<TypeParam>>();
+        TypeParam nan = xss::fp::quiet_NaN<TypeParam>();
+        TypeParam inf = xss::fp::infinity<TypeParam>();
+        ASSERT_EQ(less(nan, inf), false);
+        ASSERT_EQ(less(nan, nan), false);
+        ASSERT_EQ(less(inf, nan), true);
+        ASSERT_EQ(less(inf, inf), false);
+        ASSERT_EQ(leq(nan, inf), false);
+        ASSERT_EQ(leq(nan, nan), true);
+        ASSERT_EQ(leq(inf, nan), true);
+        ASSERT_EQ(leq(inf, inf), true);
+        ASSERT_EQ(geq(nan, inf), true);
+        ASSERT_EQ(geq(nan, nan), true);
+        ASSERT_EQ(geq(inf, nan), false);
+        ASSERT_EQ(geq(inf, inf), true);
+        ASSERT_EQ(greater(nan, inf), true);
+        ASSERT_EQ(greater(nan, nan), false);
+        ASSERT_EQ(greater(inf, nan), false);
+        ASSERT_EQ(greater(inf, inf), false);
+        ASSERT_EQ(equal(nan, inf), false);
+        ASSERT_EQ(equal(nan, nan), true);
+        ASSERT_EQ(equal(inf, nan), false);
+        ASSERT_EQ(equal(inf, inf), true);
+    }
+}
+
+REGISTER_TYPED_TEST_SUITE_P(simdsort,
+                            test_qsort,
+                            test_argsort,
+                            test_argselect,
+                            test_qselect,
+                            test_partial_qsort,
+                            test_comparator);
 
 using QSortTestTypes = testing::Types<
+#if __GNUC__ >= 13
+        _Float16,
+#endif
         float,
         double,
         uint32_t,
@@ -12,9 +171,4 @@ using QSortTestTypes = testing::Types<
         int32_t,
         uint64_t,
         int64_t>;
-using QSortTestFPTypes = testing::Types<float, double>;
-
-INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_sort, QSortTestTypes);
-INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_sort_fp, QSortTestFPTypes);
-INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_select, QSortTestTypes);
-INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_partial_sort, QSortTestTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdsort, QSortTestTypes);
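
The suite relies on GoogleTest's parameterized-by-type machinery: TYPED_TEST_P defines each test body once, REGISTER_TYPED_TEST_SUITE_P collects the test names, and INSTANTIATE_TYPED_TEST_SUITE_P stamps the whole suite out once per element of the Types<> list. A stripped-down sketch of the same wiring (my_suite and demo are placeholder names, not part of this patch):

```cpp
#include <gtest/gtest.h>

#include <cstdint>

template <typename T>
class my_suite : public ::testing::Test {};
TYPED_TEST_SUITE_P(my_suite);

// This body is compiled once per type in MyTypes; TypeParam is the
// current element of the list.
TYPED_TEST_P(my_suite, roundtrip)
{
    TypeParam x = TypeParam(1);
    ASSERT_EQ(x, TypeParam(1));
}

REGISTER_TYPED_TEST_SUITE_P(my_suite, roundtrip);
using MyTypes = ::testing::Types<float, double, int32_t>;
INSTANTIATE_TYPED_TEST_SUITE_P(demo, my_suite, MyTypes);
```
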
diff --git a/tests/test-qsort.hpp b/tests/test-qsort.hpp
deleted file mode 100644
index d6c1d85a..00000000
--- a/tests/test-qsort.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*******************************************
- * * Copyright (C) 2022 Intel Corporation
- * * SPDX-License-Identifier: BSD-3-Clause
- * *******************************************/
-
-#include "test-qsort-common.h"
-
-template <typename TypeParam>
-class avx512_sort : public ::testing::Test {
-};
-TYPED_TEST_SUITE_P(avx512_sort);
-
-TYPED_TEST_P(avx512_sort, test_random)
-{
-    if (__builtin_cpu_supports("avx512bw")) {
-        if ((sizeof(TypeParam) == 2)
-            && (!__builtin_cpu_supports("avx512vbmi2"))) {
-            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
-        }
-        std::vector<int64_t> arrsizes;
-        for (int64_t ii = 0; ii < 1024; ++ii) {
-            arrsizes.push_back((TypeParam)ii);
-        }
-        std::vector<TypeParam> arr;
-        std::vector<TypeParam> sortedarr;
-        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
-            /* Random array */
-            arr = get_uniform_rand_array<TypeParam>(arrsizes[ii]);
-            sortedarr = arr;
-            /* Sort with std::sort for comparison */
-            std::sort(sortedarr.begin(), sortedarr.end());
-            avx512_qsort<TypeParam>(arr.data(), arr.size());
-            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
-            arr.clear();
-            sortedarr.clear();
-        }
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
-    }
-}
-
-TYPED_TEST_P(avx512_sort, test_reverse)
-{
-    if (__builtin_cpu_supports("avx512bw")) {
-        if ((sizeof(TypeParam) == 2)
-            && (!__builtin_cpu_supports("avx512vbmi2"))) {
-            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
-        }
-        std::vector<int64_t> arrsizes;
-        for (int64_t ii = 0; ii < 1024; ++ii) {
-            arrsizes.push_back((TypeParam)(ii + 1));
-        }
-        std::vector<TypeParam> arr;
-        std::vector<TypeParam> sortedarr;
-        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
-            /* reverse array */
-            for (int jj = 0; jj < arrsizes[ii]; ++jj) {
-                arr.push_back((TypeParam)(arrsizes[ii] - jj));
-            }
-            sortedarr = arr;
-            /* Sort with std::sort for comparison */
-            std::sort(sortedarr.begin(), sortedarr.end());
-            avx512_qsort<TypeParam>(arr.data(), arr.size());
-            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
-            arr.clear();
-            sortedarr.clear();
-        }
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
-    }
-}
-
-TYPED_TEST_P(avx512_sort, test_constant)
-{
-    if (__builtin_cpu_supports("avx512bw")) {
-        if ((sizeof(TypeParam) == 2)
-            && (!__builtin_cpu_supports("avx512vbmi2"))) {
-            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
-        }
-        std::vector<int64_t> arrsizes;
-        for (int64_t ii = 0; ii < 1024; ++ii) {
-            arrsizes.push_back((TypeParam)(ii + 1));
-        }
-        std::vector<TypeParam> arr;
-        std::vector<TypeParam> sortedarr;
-        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
-            /* constant array */
-            for (int jj = 0; jj < arrsizes[ii]; ++jj) {
-                arr.push_back(ii);
-            }
-            sortedarr = arr;
-            /* Sort with std::sort for comparison */
-            std::sort(sortedarr.begin(), sortedarr.end());
-            avx512_qsort<TypeParam>(arr.data(), arr.size());
-            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
-            arr.clear();
-            sortedarr.clear();
-        }
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
-    }
-}
-
-TYPED_TEST_P(avx512_sort, test_small_range)
-{
-    if (__builtin_cpu_supports("avx512bw")) {
-        if ((sizeof(TypeParam) == 2)
-            && (!__builtin_cpu_supports("avx512vbmi2"))) {
-            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
-        }
-        std::vector<int64_t> arrsizes;
-        for (int64_t ii = 0; ii < 1024; ++ii) {
-            arrsizes.push_back((TypeParam)(ii + 1));
-        }
-        std::vector<TypeParam> arr;
-        std::vector<TypeParam> sortedarr;
-        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
-            arr = get_uniform_rand_array<TypeParam>(arrsizes[ii], 20, 1);
-            sortedarr = arr;
-            /* Sort with std::sort for comparison */
-            std::sort(sortedarr.begin(), sortedarr.end());
-            avx512_qsort<TypeParam>(arr.data(), arr.size());
-            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
-            arr.clear();
-            sortedarr.clear();
-        }
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
-    }
-}
-
-TYPED_TEST_P(avx512_sort, test_max_value_at_end_of_array)
-{
-    if (!__builtin_cpu_supports("avx512bw")) {
-        GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA";
-    }
-    if ((sizeof(TypeParam) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) {
-        GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
-    }
-    std::vector<int64_t> arrsizes;
-    for (int64_t ii = 1; ii <= 1024; ++ii) {
-        arrsizes.push_back(ii);
-    }
-    std::vector<TypeParam> arr;
-    std::vector<TypeParam> sortedarr;
-    for (auto &size : arrsizes) {
-        arr = get_uniform_rand_array<TypeParam>(size);
-        if (std::numeric_limits<TypeParam>::has_infinity) {
-            arr[size - 1] = std::numeric_limits<TypeParam>::infinity();
-        }
-        else {
-            arr[size - 1] = std::numeric_limits<TypeParam>::max();
-        }
-        sortedarr = arr;
-        avx512_qsort<TypeParam>(arr.data(), arr.size());
-        std::sort(sortedarr.begin(), sortedarr.end());
-        EXPECT_EQ(sortedarr, arr) << "Array size = " << size;
-        arr.clear();
-        sortedarr.clear();
-    }
-}
-
-REGISTER_TYPED_TEST_SUITE_P(avx512_sort,
-                            test_random,
-                            test_reverse,
-                            test_constant,
-                            test_small_range,
-                            test_max_value_at_end_of_array);
diff --git a/tests/test-qsortfp16.cpp b/tests/test-qsortfp16.cpp
deleted file mode 100644
index d1bd985a..00000000
--- a/tests/test-qsortfp16.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*******************************************
- * * Copyright (C) 2022 Intel Corporation
- * * SPDX-License-Identifier: BSD-3-Clause
- * *******************************************/
-
-#include "avx512fp16-16bit-qsort.hpp"
-
-#include "rand_array.h"
-#include <gtest/gtest.h>
-#include <vector>
-
-TEST(avx512_qsort_float16, test_arrsizes)
-{
-    if (__builtin_cpu_supports("avx512fp16")) {
-        std::vector<int64_t> arrsizes;
-        for (int64_t ii = 0; ii < 1024; ++ii) {
-            arrsizes.push_back(ii);
-        }
-        std::vector<_Float16> arr;
-        std::vector<_Float16> sortedarr;
-
-        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
-            /* Random array */
-            for (auto jj = 0; jj < arrsizes[ii]; ++jj) {
-                _Float16 temp = (float)rand() / (float)(RAND_MAX);
-                arr.push_back(temp);
-                sortedarr.push_back(temp);
-            }
-            /* Sort with std::sort for comparison */
-            std::sort(sortedarr.begin(), sortedarr.end());
-            avx512_qsort<_Float16>(arr.data(), arr.size());
-            ASSERT_EQ(sortedarr, arr);
-            arr.clear();
-            sortedarr.clear();
-        }
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA";
-    }
-}
-
-TEST(avx512_qsort_float16, test_special_floats)
-{
-    if (__builtin_cpu_supports("avx512fp16")) {
-        const int arrsize = 1111;
-        std::vector<_Float16> arr;
-        std::vector<_Float16> sortedarr;
-        Fp16Bits temp;
-        for (size_t jj = 0; jj < arrsize; ++jj) {
-            temp.f_ = (float)rand() / (float)(RAND_MAX);
-            switch (rand() % 10) {
-                case 0: temp.i_ = 0xFFFF; break;
-                case 1: temp.i_ = X86_SIMD_SORT_INFINITYH; break;
-                case 2: temp.i_ = X86_SIMD_SORT_NEGINFINITYH; break;
-                default: break;
-            }
-            arr.push_back(temp.f_);
-            sortedarr.push_back(temp.f_);
-        }
-        /* Cannot use std::sort because it treats NAN differently */
-        avx512_qsort_fp16(reinterpret_cast<uint16_t *>(sortedarr.data()),
-                          sortedarr.size());
-        avx512_qsort<_Float16>(arr.data(), arr.size());
-        // Cannot rely on ASSERT_EQ since it returns false if there are NAN's
-        if (memcmp(arr.data(), sortedarr.data(), arrsize * 2) != 0) {
-            ASSERT_EQ(sortedarr, arr);
-        }
-        arr.clear();
-        sortedarr.clear();
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA";
-    }
-}
-
-TEST(avx512_qselect_float16, test_arrsizes)
-{
-    if (__builtin_cpu_supports("avx512fp16")) {
-        std::vector<int64_t> arrsizes;
-        for (int64_t ii = 0; ii < 1024; ++ii) {
-            arrsizes.push_back(ii);
-        }
-        std::vector<_Float16> arr;
-        std::vector<_Float16> sortedarr;
-        std::vector<_Float16> psortedarr;
-
-        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
-            /* Random array */
-            for (auto jj = 0; jj < arrsizes[ii]; ++jj) {
-                _Float16 temp = (float)rand() / (float)(RAND_MAX);
-                arr.push_back(temp);
-                sortedarr.push_back(temp);
-            }
-            /* Sort with std::sort for comparison */
-            std::sort(sortedarr.begin(), sortedarr.end());
-            for (size_t k = 0; k < arr.size(); ++k) {
-                psortedarr = arr;
-                avx512_qselect<_Float16>(
-                        psortedarr.data(), k, psortedarr.size());
-                /* index k is correct */
-                ASSERT_EQ(sortedarr[k], psortedarr[k]);
-                /* Check left partition */
-                for (size_t jj = 0; jj < k; jj++) {
-                    ASSERT_LE(psortedarr[jj], psortedarr[k]);
-                }
-                /* Check right partition */
-                for (size_t jj = k + 1; jj < arr.size(); jj++) {
-                    ASSERT_GE(psortedarr[jj], psortedarr[k]);
-                }
-                psortedarr.clear();
-            }
-            arr.clear();
-            sortedarr.clear();
-        }
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA";
-    }
-}
-
-TEST(avx512_partial_qsort_float16, test_ranges)
-{
-    if (__builtin_cpu_supports("avx512fp16")) {
-        int64_t arrsize = 1024;
-        int64_t nranges = 500;
-
-        std::vector<_Float16> arr;
-        std::vector<_Float16> sortedarr;
-        std::vector<_Float16> psortedarr;
-
-        /* Random array */
-        for (auto ii = 0; ii < arrsize; ++ii) {
-            _Float16 temp = (float)rand() / (float)(RAND_MAX);
-            arr.push_back(temp);
-            sortedarr.push_back(temp);
-        }
-        /* Sort with std::sort for comparison */
-        std::sort(sortedarr.begin(), sortedarr.end());
-
-        for (auto ii = 0; ii < nranges; ++ii) {
-            psortedarr = arr;
-
-            int k = get_uniform_rand_array<int64_t>(1, arrsize, 1).front();
-
-            /* Sort the range and verify all the required elements match the presorted set */
-            avx512_partial_qsort<_Float16>(
-                    psortedarr.data(), k, psortedarr.size());
-            for (auto jj = 0; jj < k; jj++) {
-                ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
-            }
-
-            psortedarr.clear();
-        }
-
-        arr.clear();
-        sortedarr.clear();
-    }
-    else {
-        GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA";
-    }
-}
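
The 16-bit patterns used in these deleted tests (0xFFFF, X86_SIMD_SORT_INFINITYH, X86_SIMD_SORT_NEGINFINITYH) and in the convert_bits() constants that custom-float.h introduces below are IEEE binary16 encodings: all five exponent bits set with a zero mantissa means infinity, and any nonzero mantissa on top of that means NaN. A short sketch of the bit-level classification the tests rely on (fp16_bits_is_nan/fp16_bits_is_inf are illustrative helpers, not part of the library):

```cpp
#include <cstdint>

// binary16 layout: 1 sign bit | 5 exponent bits | 10 mantissa bits.
// exponent == 0x1f, mantissa == 0  -> infinity (0x7c00, 0xfc00)
// exponent == 0x1f, mantissa != 0  -> NaN      (0x7c01, 0xffff, ...)
bool fp16_bits_is_nan(uint16_t bits)
{
    return ((bits & 0x7c00) == 0x7c00) && ((bits & 0x03ff) != 0);
}

bool fp16_bits_is_inf(uint16_t bits)
{
    return (bits & 0x7fff) == 0x7c00;
}
```
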
diff --git a/utils/custom-compare.h b/utils/custom-compare.h
new file mode 100644
index 00000000..d99f0491
--- /dev/null
+++ b/utils/custom-compare.h
@@ -0,0 +1,44 @@
+#include <cmath>
+#include <functional>
+#include "custom-float.h"
+
+/*
+ * Custom comparator class to handle NAN's: treats NAN > INF
+ */
+template <typename T, typename Comparator>
+struct compare {
+    static constexpr auto op = Comparator {};
+    bool operator()(const T a, const T b)
+    {
+        if constexpr (xss::fp::is_floating_point_v<T>) {
+            T inf = xss::fp::infinity<T>();
+            T one = (T)1.0;
+            if (!xss::fp::isunordered(a, b)) { return op(a, b); }
+            else if ((xss::fp::isnan(a)) && (!xss::fp::isnan(b))) {
+                return b == inf ? op(inf, one) : op(inf, b);
+            }
+            else if ((!xss::fp::isnan(a)) && (xss::fp::isnan(b))) {
+                return a == inf ? op(one, inf) : op(a, inf);
+            }
+            else {
+                return op(one, one);
+            }
+        }
+        else {
+            return op(a, b);
+        }
+    }
+};
+
+template <typename T>
+struct compare_arg {
+    compare_arg(const T *arr)
+    {
+        this->arr = arr;
+    }
+    bool operator()(const int64_t a, const int64_t b)
+    {
+        return compare<T, std::less<T>>()(arr[a], arr[b]);
+    }
+    const T *arr;
+};
diff --git a/utils/custom-float.h b/utils/custom-float.h
new file mode 100644
index 00000000..001d4245
--- /dev/null
+++ b/utils/custom-float.h
@@ -0,0 +1,91 @@
+#ifndef UTILS_FLOAT
+#define UTILS_FLOAT
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+namespace xss {
+namespace fp
+{
+    template <typename T>
+    inline constexpr bool is_floating_point_v = std::is_floating_point_v<T>;
+
+    template <typename T>
+    bool isnan(T elem)
+    {
+        return std::isnan(elem);
+    }
+    template <typename T>
+    bool isunordered(T a, T b)
+    {
+        return std::isunordered(a, b);
+    }
+    template <typename T>
+    T max()
+    {
+        return std::numeric_limits<T>::max();
+    }
+    template <typename T>
+    T min()
+    {
+        return std::numeric_limits<T>::min();
+    }
+    template <typename T>
+    T infinity()
+    {
+        return std::numeric_limits<T>::infinity();
+    }
+    template <typename T>
+    T quiet_NaN()
+    {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+
+#ifdef __FLT16_MAX__
+    typedef union {
+        _Float16 f_;
+        uint16_t i_;
+    } Fp16Bits;
+
+    inline _Float16 convert_bits(uint16_t val)
+    {
+        Fp16Bits temp;
+        temp.i_ = val;
+        return temp.f_;
+    }
+
+    template <>
+    inline constexpr bool is_floating_point_v<_Float16> = true;
+
+    template <>
+    bool isnan<_Float16>(_Float16 elem)
+    {
+        return elem != elem;
+    }
+    template <>
+    bool isunordered<_Float16>(_Float16 a, _Float16 b)
+    {
+        return isnan(a) || isnan(b);
+    }
+    template <>
+    _Float16 max<_Float16>()
+    {
+        return convert_bits(0x7bff);
+    }
+    template <>
+    _Float16 min<_Float16>()
+    {
+        return convert_bits(0x0400);
+    }
+    template <>
+    _Float16 infinity<_Float16>()
+    {
+        return convert_bits(0x7c00);
+    }
+    template <>
+    _Float16 quiet_NaN<_Float16>()
+    {
+        return convert_bits(0x7c01);
+    }
+#endif
+
+} // namespace fp
+} // namespace xss
+#endif
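
Because compare orders NaN after +inf, it can be dropped into any standard algorithm to reproduce the total order the SIMD kernels produce. A small standalone sketch, assuming only the two utility headers above:

```cpp
#include "custom-compare.h"

#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
    float nan = std::numeric_limits<float>::quiet_NaN();
    float inf = std::numeric_limits<float>::infinity();
    std::vector<float> v {inf, 2.5f, nan, -1.0f, nan, 0.0f};

    // NaN compares greater than inf, so both NaNs land at the very end.
    std::sort(v.begin(), v.end(), compare<float, std::less<float>>());

    for (float x : v)
        printf("%f ", x); // -1.0 0.0 2.5 inf nan nan
    return 0;
}
```
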
diff --git a/utils/rand_array.h b/utils/rand_array.h
index a780f50d..562c67bf 100644
--- a/utils/rand_array.h
+++ b/utils/rand_array.h
@@ -2,44 +2,48 @@
  * * Copyright (C) 2022 Intel Corporation
  * * SPDX-License-Identifier: BSD-3-Clause
  * *******************************************/
+#ifndef UTILS_RAND_ARRAY
+#define UTILS_RAND_ARRAY
 #include <algorithm>
 #include <iostream>
 #include <random>
 #include <vector>
+#include <string>
+#include "custom-float.h"
 
 template <typename T>
 static std::vector<T> get_uniform_rand_array(
         int64_t arrsize,
-        T max = std::numeric_limits<T>::max(),
-        T min = std::numeric_limits<T>::min(),
-        typename std::enable_if<std::is_integral<T>::value>::type * = 0)
+        T max = xss::fp::max<T>(),
+        T min = xss::fp::min<T>())
 {
     std::vector<T> arr;
-    std::random_device r;
-    std::default_random_engine e1(r());
-    e1.seed(42);
-    std::uniform_int_distribution<T> uniform_dist(min, max);
-    for (int64_t ii = 0; ii < arrsize; ++ii) {
-        arr.emplace_back(uniform_dist(e1));
-    }
-    return arr;
-}
-
-template <typename T>
-static std::vector<T> get_uniform_rand_array(
-        int64_t arrsize,
-        T max = std::numeric_limits<T>::max(),
-        T min = std::numeric_limits<T>::min(),
-        typename std::enable_if<std::is_floating_point<T>::value>::type * = 0)
-{
     std::random_device rd;
-    std::mt19937 gen(rd());
-    gen.seed(42);
-    std::uniform_real_distribution<T> dis(min, max);
-    std::vector<T> arr;
-    for (int64_t ii = 0; ii < arrsize; ++ii) {
-        arr.emplace_back(dis(gen));
+    if constexpr (std::is_floating_point_v<T>) {
+        std::mt19937 gen(rd());
+        gen.seed(42);
+        std::uniform_real_distribution<T> dis(min, max);
+        for (int64_t ii = 0; ii < arrsize; ++ii) {
+            arr.emplace_back(dis(gen));
+        }
+    }
+#ifdef __FLT16_MAX__
+    else if constexpr (std::is_same_v<T, _Float16>) {
+        (void)(max);
+        (void)(min);
+        for (auto jj = 0; jj < arrsize; ++jj) {
+            float temp = (float)rand() / (float)(RAND_MAX);
+            arr.push_back((_Float16)temp);
+        }
+    }
+#endif
+    else if constexpr (std::is_integral_v<T>) {
+        std::default_random_engine e1(rd());
+        e1.seed(42);
+        std::uniform_int_distribution<T> uniform_dist(min, max);
+        for (int64_t ii = 0; ii < arrsize; ++ii) {
+            arr.emplace_back(uniform_dist(e1));
+        }
     }
     return arr;
 }
@@ -47,8 +51,8 @@ static std::vector<T> get_uniform_rand_array(
 template <typename T>
 static std::vector<T>
 get_uniform_rand_array_with_uniquevalues(int64_t arrsize,
-                                         T max = std::numeric_limits<T>::max(),
-                                         T min = std::numeric_limits<T>::min())
+                                         T max = xss::fp::max<T>(),
+                                         T min = xss::fp::min<T>())
 {
     std::vector<T> arr = get_uniform_rand_array(arrsize, max, min);
     typename std::vector<T>::iterator ip
@@ -56,3 +60,78 @@ get_uniform_rand_array_with_uniquevalues(int64_t arrsize,
     arr.resize(std::distance(arr.begin(), ip));
     return arr;
 }
+
+template <typename T>
+static std::vector<T>
+get_array(std::string arrtype,
+          int64_t arrsize,
+          T min = xss::fp::min<T>(),
+          T max = xss::fp::max<T>())
+{
+    std::vector<T> arr;
+    if (arrtype == "random") { arr = get_uniform_rand_array(arrsize, max, min); }
+    else if (arrtype == "sorted") {
+        arr = get_uniform_rand_array(arrsize, max, min);
+        std::sort(arr.begin(), arr.end());
+    }
+    else if (arrtype == "constant") {
+        T temp = get_uniform_rand_array(1, max, min)[0];
+        for (auto ii = 0; ii < arrsize; ++ii) {
+            arr.push_back(temp);
+        }
+    }
+    else if (arrtype == "reverse") {
+        arr = get_uniform_rand_array(arrsize, max, min);
+        std::sort(arr.begin(), arr.end());
+        std::reverse(arr.begin(), arr.end());
+    }
+    else if (arrtype == "smallrange") {
+        arr = get_uniform_rand_array<T>(arrsize, 10, 1);
+    }
+    else if (arrtype == "max_at_the_end") {
+        arr = get_uniform_rand_array(arrsize, max, min);
+        if constexpr (xss::fp::is_floating_point_v<T>) {
+            arr[arrsize - 1] = xss::fp::infinity<T>();
+        }
+        else {
+            arr[arrsize - 1] = std::numeric_limits<T>::max();
+        }
+    }
+    else if (arrtype == "rand_with_nan") {
+        arr = get_uniform_rand_array(arrsize, max, min);
+        int64_t num_nans = 10 % arrsize;
+        std::vector<int64_t> rand_indx
+                = get_uniform_rand_array<int64_t>(num_nans, arrsize - 1, 0);
+        T val;
+        if constexpr (xss::fp::is_floating_point_v<T>) {
+            val = xss::fp::quiet_NaN<T>();
+        }
+        else {
+            val = std::numeric_limits<T>::max();
+        }
+        for (auto ind : rand_indx) {
+            arr[ind] = val;
+        }
+    }
+    else if (arrtype == "rand_max") {
+        arr = get_uniform_rand_array(arrsize, max, min);
+        T val;
+        if constexpr (xss::fp::is_floating_point_v<T>) {
+            val = xss::fp::infinity<T>();
+        }
+        else {
+            val = std::numeric_limits<T>::max();
+        }
+        for (auto ii = 0; ii < arrsize; ++ii) {
+            if (rand() % 2) {
+                arr[ii] = val;
+            }
+        }
+    }
+    else {
+        std::cout << "Warning: unrecognized array type " << arrtype
+                  << std::endl;
+    }
+    return arr;
+}
+
+#endif // UTILS_RAND_ARRAY
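
get_array is the single entry point the typed tests use to build each input distribution by name. A quick sketch of how it pairs with the NaN-aware comparator (a hypothetical standalone harness, not part of the suite):

```cpp
#include "custom-compare.h"
#include "rand_array.h"

#include <algorithm>
#include <cassert>
#include <cmath>

int main()
{
    // Build a 1000-element float array with a handful of NaNs
    // scattered through it ...
    auto arr = get_array<float>("rand_with_nan", 1000);

    // ... and sort it with the comparator from custom-compare.h:
    // NaNs order after +inf, everything else ascending.
    std::sort(arr.begin(), arr.end(), compare<float, std::less<float>>());

    // All NaNs end up at the back of the array.
    assert(std::isnan(arr.back()));
    return 0;
}
```
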