diff --git a/examples/Makefile b/examples/Makefile index 80917c1b..7694bcc1 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,30 +1,24 @@ -CXX ?= g++-12 -CFLAGS = -I../src -std=c++17 -O3 $(if $(CXXFLAGS),$(CXXFLAGS),) -EXE = qsort32avx2 argsort kvsort qsortfp16 qsort16 qsort32 qsort64 +CXX ?= g++-13 +CFLAGS = -I../src -std=c++17 -O3 +EXE = kvsort qsortavx2 qsortavx512 qsortspr qsorticl default: all all : $(EXE) -qsortfp16: avx512fp-16bit-qsort.cpp - $(CXX) -o qsortfp16 -march=sapphirerapids $(CFLAGS) avx512fp-16bit-qsort.cpp - -qsort16: avx512-16bit-qsort.cpp - $(CXX) -o qsort16 -march=icelake-client $(CFLAGS) avx512-16bit-qsort.cpp - -qsort32: avx512-32bit-qsort.cpp - $(CXX) -o qsort32 -march=skylake-avx512 $(CFLAGS) avx512-32bit-qsort.cpp +kvsort: avx512-kv.cpp + $(CXX) -o kvsort -mavx512vl -mavx512dq $(CFLAGS) avx512-kv.cpp -qsort32avx2: avx2-32bit-qsort.cpp - $(CXX) -o qsort32avx2 -march=haswell $(CFLAGS) avx2-32bit-qsort.cpp +qsortavx512: skx-avx2.cpp + $(CXX) -o qsortavx512 -mavx512vl -mavx512dq $(CFLAGS) skx-avx2.cpp -qsort64: avx512-64bit-qsort.cpp - $(CXX) -o qsort64 -march=skylake-avx512 $(CFLAGS) avx512-64bit-qsort.cpp +qsortavx2: skx-avx2.cpp + $(CXX) -o qsortavx2 -mavx2 $(CFLAGS) skx-avx2.cpp -argsort: avx512-argsort.cpp - $(CXX) -o argsort -march=skylake-avx512 $(CFLAGS) avx512-argsort.cpp +qsorticl: icl-16bit.cpp + $(CXX) -o qsorticl -mavx512vl -mavx512bw -mavx512dq -mavx512vbmi2 $(CFLAGS) icl-16bit.cpp -kvsort: avx512-kv.cpp - $(CXX) -o kvsort -march=skylake-avx512 $(CFLAGS) avx512-kv.cpp +qsortspr: spr-16bit.cpp + $(CXX) -o qsortspr -mavx512vl -mavx512dq -mavx512vbmi2 -mavx512fp16 $(CFLAGS) spr-16bit.cpp clean: $(RM) $(EXE) diff --git a/examples/avx2-32bit-qsort.cpp b/examples/avx2-32bit-qsort.cpp deleted file mode 100644 index 5e36aa22..00000000 --- a/examples/avx2-32bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx2-32bit-qsort.hpp" - -int main() -{ - const int size = 1000; - float arr[size]; - avx2_qsort(arr, size); - 
avx2_qselect(arr, 10, size); - avx2_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-16bit-qsort.cpp b/examples/avx512-16bit-qsort.cpp deleted file mode 100644 index 9990402b..00000000 --- a/examples/avx512-16bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512-16bit-qsort.hpp" - -int main() -{ - const int size = 1000; - short arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-32bit-qsort.cpp b/examples/avx512-32bit-qsort.cpp deleted file mode 100644 index 8d8b8b7a..00000000 --- a/examples/avx512-32bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512-32bit-qsort.hpp" - -int main() -{ - const int size = 1000; - float arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-64bit-qsort.cpp b/examples/avx512-64bit-qsort.cpp deleted file mode 100644 index 400f860a..00000000 --- a/examples/avx512-64bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512-64bit-qsort.hpp" - -int main() -{ - const int size = 1000; - double arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-argsort.cpp b/examples/avx512-argsort.cpp deleted file mode 100644 index cbe21066..00000000 --- a/examples/avx512-argsort.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include "avx512-64bit-argsort.hpp" - -int main() -{ - const int size = 1000; - float arr[size]; - std::vector arg1 = avx512_argsort(arr, size); - std::vector arg2 = avx512_argselect(arr, 10, size); - return 0; -} diff --git a/examples/avx512-kv.cpp b/examples/avx512-kv.cpp index f46a1020..3ca6d090 100644 --- a/examples/avx512-kv.cpp +++ b/examples/avx512-kv.cpp @@ -1,4 +1,4 @@ -#include "avx512-64bit-keyvaluesort.hpp" +#include "x86simdsort-static-incl.h" int main() { @@ -7,17 +7,17 @@ int main() 
uint64_t arr2[size]; double arr3[size]; float arr4[size]; - avx512_qsort_kv(arr1, arr1, size); - avx512_qsort_kv(arr1, arr2, size); - avx512_qsort_kv(arr1, arr3, size); - avx512_qsort_kv(arr2, arr1, size); - avx512_qsort_kv(arr2, arr2, size); - avx512_qsort_kv(arr2, arr3, size); - avx512_qsort_kv(arr3, arr1, size); - avx512_qsort_kv(arr3, arr2, size); - avx512_qsort_kv(arr1, arr4, size); - avx512_qsort_kv(arr2, arr4, size); - avx512_qsort_kv(arr3, arr4, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr1, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr2, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr3, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr1, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr2, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr3, size); + x86simdsortStatic::keyvalue_qsort(arr3, arr1, size); + x86simdsortStatic::keyvalue_qsort(arr3, arr2, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr4, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr4, size); + x86simdsortStatic::keyvalue_qsort(arr3, arr4, size); return 0; return 0; } diff --git a/examples/avx512fp-16bit-qsort.cpp b/examples/avx512fp-16bit-qsort.cpp deleted file mode 100644 index 18e1c823..00000000 --- a/examples/avx512fp-16bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512fp16-16bit-qsort.hpp" - -int main() -{ - const int size = 1000; - _Float16 arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/icl-16bit.cpp b/examples/icl-16bit.cpp new file mode 100644 index 00000000..e789b0f4 --- /dev/null +++ b/examples/icl-16bit.cpp @@ -0,0 +1,11 @@ +#include "x86simdsort-static-incl.h" + +int main() +{ + const int size = 1000; + short arr[size]; + x86simdsortStatic::qsort(arr, size); + x86simdsortStatic::qselect(arr, 10, size); + x86simdsortStatic::partial_qsort(arr, 10, size); + return 0; +} diff --git a/examples/skx-avx2.cpp b/examples/skx-avx2.cpp new file 
mode 100644 index 00000000..ef4bc050 --- /dev/null +++ b/examples/skx-avx2.cpp @@ -0,0 +1,19 @@ +#include "x86simdsort-static-incl.h" + +int main() +{ + const int size = 1000; + double arrd[size]; + float arrf[size]; + x86simdsortStatic::qsort(arrf, size); + x86simdsortStatic::qsort(arrd, size); + x86simdsortStatic::qselect(arrf, 10, size); + x86simdsortStatic::qselect(arrd, 10, size); + x86simdsortStatic::partial_qsort(arrf, 10, size); + x86simdsortStatic::partial_qsort(arrd, 10, size); + auto arg1 = x86simdsortStatic::argsort(arrf, size); + auto arg2 = x86simdsortStatic::argselect(arrf, 10, size); + auto arg3 = x86simdsortStatic::argsort(arrd, size); + auto arg4 = x86simdsortStatic::argselect(arrd, 10, size); + return 0; +} diff --git a/examples/spr-16bit.cpp b/examples/spr-16bit.cpp new file mode 100644 index 00000000..6fb4c3ab --- /dev/null +++ b/examples/spr-16bit.cpp @@ -0,0 +1,11 @@ +#include "x86simdsort-static-incl.h" + +int main() +{ + const int size = 1000; + _Float16 arr[size]; + x86simdsortStatic::qsort(arr, size); + x86simdsortStatic::qselect(arr, 10, size); + x86simdsortStatic::partial_qsort(arr, 10, size); + return 0; +} diff --git a/lib/x86simdsort-avx2.cpp b/lib/x86simdsort-avx2.cpp index e10fc164..2afc4d1d 100644 --- a/lib/x86simdsort-avx2.cpp +++ b/lib/x86simdsort-avx2.cpp @@ -1,39 +1,36 @@ // AVX2 specific routines: -#include "avx2-32bit-qsort.hpp" -#include "avx2-64bit-qsort.hpp" -#include "avx2-32bit-half.hpp" -#include "xss-common-argsort.h" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" #define DEFINE_ALL_METHODS(type) \ template <> \ void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - avx2_qsort(arr, arrsize, hasnan, descending); \ + x86simdsortStatic::qsort(arr, arrsize, hasnan, descending); \ } \ template <> \ void qselect( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx2_qselect(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::qselect(arr, k, 
arrsize, hasnan, descending); \ } \ template <> \ void partial_qsort( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx2_partial_qsort(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \ } \ template <> \ std::vector argsort( \ type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - return avx2_argsort(arr, arrsize, hasnan, descending); \ + return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \ } \ template <> \ std::vector argselect( \ type *arr, size_t k, size_t arrsize, bool hasnan) \ { \ - return avx2_argselect(arr, k, arrsize, hasnan); \ + return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \ } namespace xss { diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 20095369..eeb7b2bf 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -1,5 +1,5 @@ // ICL specific routines: -#include "avx512-16bit-qsort.hpp" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" namespace xss { @@ -7,7 +7,7 @@ namespace avx512 { template <> void qsort(uint16_t *arr, size_t size, bool hasnan, bool descending) { - avx512_qsort(arr, size, hasnan, descending); + x86simdsortStatic::qsort(arr, size, hasnan, descending); } template <> void qselect(uint16_t *arr, @@ -16,7 +16,7 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_qselect(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); } template <> void partial_qsort(uint16_t *arr, @@ -25,12 +25,12 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_partial_qsort(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); } template <> void qsort(int16_t *arr, size_t size, bool hasnan, bool descending) { - avx512_qsort(arr, size, hasnan, descending); + x86simdsortStatic::qsort(arr, size, hasnan, descending); } template <> void 
qselect(int16_t *arr, @@ -39,7 +39,7 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_qselect(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); } template <> void partial_qsort(int16_t *arr, @@ -48,7 +48,7 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_partial_qsort(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); } } // namespace avx512 } // namespace xss diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp index 8b154d4e..829dd7b8 100644 --- a/lib/x86simdsort-skx.cpp +++ b/lib/x86simdsort-skx.cpp @@ -1,71 +1,68 @@ // SKX specific routines: -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-keyvaluesort.hpp" -#include "avx512-64bit-argsort.hpp" -#include "avx512-64bit-qsort.hpp" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" #define DEFINE_ALL_METHODS(type) \ template <> \ void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - avx512_qsort(arr, arrsize, hasnan, descending); \ + x86simdsortStatic::qsort(arr, arrsize, hasnan, descending); \ } \ template <> \ void qselect( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx512_qselect(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); \ } \ template <> \ void partial_qsort( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx512_partial_qsort(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \ } \ template <> \ std::vector argsort( \ type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - return avx512_argsort(arr, arrsize, hasnan, descending); \ + return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \ } \ template <> \ std::vector argselect( \ type *arr, size_t k, size_t arrsize, bool hasnan) \ { \ - return 
avx512_argselect(arr, k, arrsize, hasnan); \ + return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \ } #define DEFINE_KEYVALUE_METHODS(type) \ template <> \ void keyvalue_qsort(type *key, uint64_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, int64_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, double *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, uint32_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, int32_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, float *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } namespace xss { diff --git a/lib/x86simdsort-spr.cpp b/lib/x86simdsort-spr.cpp index b09a8393..b8069d2b 100644 --- a/lib/x86simdsort-spr.cpp +++ b/lib/x86simdsort-spr.cpp @@ -1,5 +1,5 @@ // SPR specific routines: -#include "avx512fp16-16bit-qsort.hpp" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" namespace xss { @@ -7,10 +7,7 @@ namespace avx512 { template <> void qsort(_Float16 *arr, size_t size, bool hasnan, bool descending) { - if (descending) { avx512_qsort(arr, size, hasnan); } - else { - avx512_qsort(arr, size, hasnan); - } + 
x86simdsortStatic::qsort(arr, size, hasnan, descending); } template <> void qselect(_Float16 *arr, @@ -19,10 +16,7 @@ namespace avx512 { bool hasnan, bool descending) { - if (descending) { avx512_qselect(arr, k, arrsize, hasnan); } - else { - avx512_qselect(arr, k, arrsize, hasnan); - } + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); } template <> void partial_qsort(_Float16 *arr, @@ -31,10 +25,7 @@ namespace avx512 { bool hasnan, bool descending) { - if (descending) { avx512_partial_qsort(arr, k, arrsize, hasnan); } - else { - avx512_partial_qsort(arr, k, arrsize, hasnan); - } + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); } } // namespace avx512 } // namespace xss diff --git a/meson.build b/meson.build index 873094ba..7ae934df 100644 --- a/meson.build +++ b/meson.build @@ -36,7 +36,7 @@ cancompilefp16 = cpp.compiles(fp16code, args:'-march=sapphirerapids') subdir('lib') libsimdsort = shared_library('x86simdsortcpp', 'lib/x86simdsort.cpp', - include_directories : [utils, lib], + include_directories : [src, utils, lib], link_with : [libtargets], gnu_symbol_visibility : 'inlineshidden', install : true, diff --git a/src/avx2-32bit-half.hpp b/src/avx2-32bit-half.hpp index 52697692..9100cbbc 100644 --- a/src/avx2-32bit-half.hpp +++ b/src/avx2-32bit-half.hpp @@ -7,7 +7,6 @@ #ifndef AVX2_HALF_32BIT #define AVX2_HALF_32BIT -#include "xss-common-includes.h" #include "avx2-emu-funcs.hpp" /* diff --git a/src/avx2-32bit-qsort.hpp b/src/avx2-32bit-qsort.hpp index ad4e99fc..b8cca7a4 100644 --- a/src/avx2-32bit-qsort.hpp +++ b/src/avx2-32bit-qsort.hpp @@ -7,7 +7,6 @@ #ifndef AVX2_QSORT_32BIT #define AVX2_QSORT_32BIT -#include "xss-common-qsort.h" #include "avx2-emu-funcs.hpp" /* diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index c633b4b9..32e5e385 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -8,7 +8,6 @@ #ifndef AVX2_QSORT_64BIT #define AVX2_QSORT_64BIT -#include "xss-common-qsort.h" 
#include "avx2-emu-funcs.hpp" /* diff --git a/src/avx2-emu-funcs.hpp b/src/avx2-emu-funcs.hpp index 38489626..a30da7cc 100644 --- a/src/avx2-emu-funcs.hpp +++ b/src/avx2-emu-funcs.hpp @@ -3,7 +3,6 @@ #include #include -#include "xss-common-qsort.h" constexpr auto avx2_mask_helper_lut32 = [] { std::array, 256> lut {}; diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index 28c1c1fe..76db872e 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -7,8 +7,6 @@ #ifndef AVX512_16BIT_COMMON #define AVX512_16BIT_COMMON -#include "xss-common-qsort.h" - /* * Constants used in sorting 32 elements in a ZMM registers. Based on Bitonic * sorting network (see diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 15c7c91e..18595ac1 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -560,10 +560,11 @@ X86_SIMD_SORT_INLINE_ONLY bool is_a_nan(uint16_t elem) return ((elem & 0x7c00u) == 0x7c00u) && ((elem & 0x03ffu) != 0); } -X86_SIMD_SORT_INLINE void avx512_qsort_fp16(uint16_t *arr, - arrsize_t arrsize, - bool hasnan = false, - bool descending = false) +[[maybe_unused]] X86_SIMD_SORT_INLINE void +avx512_qsort_fp16(uint16_t *arr, + arrsize_t arrsize, + bool hasnan = false, + bool descending = false) { using vtype = zmm_vector; @@ -585,11 +586,12 @@ X86_SIMD_SORT_INLINE void avx512_qsort_fp16(uint16_t *arr, } } -X86_SIMD_SORT_INLINE void avx512_qselect_fp16(uint16_t *arr, - arrsize_t k, - arrsize_t arrsize, - bool hasnan = false, - bool descending = false) +[[maybe_unused]] X86_SIMD_SORT_INLINE void +avx512_qselect_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false, + bool descending = false) { using vtype = zmm_vector; @@ -617,11 +619,12 @@ X86_SIMD_SORT_INLINE void avx512_qselect_fp16(uint16_t *arr, } } -X86_SIMD_SORT_INLINE void avx512_partial_qsort_fp16(uint16_t *arr, - arrsize_t k, - arrsize_t arrsize, - bool hasnan = false, - bool descending = false) +[[maybe_unused]] 
X86_SIMD_SORT_INLINE void +avx512_partial_qsort_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false, + bool descending = false) { avx512_qselect_fp16(arr, k - 1, arrsize, hasnan, descending); - avx512_qsort_fp16(arr, k - 1, descending); + avx512_qsort_fp16(arr, k - 1, hasnan, descending); diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index 8b44e76e..eeaba51c 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -8,8 +8,6 @@ #ifndef AVX512_QSORT_32BIT #define AVX512_QSORT_32BIT -#include "xss-common-qsort.h" - /* * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic * sorting network (see diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 68735c33..8f5fdce9 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -7,7 +7,6 @@ #ifndef AVX512_64BIT_COMMON #define AVX512_64BIT_COMMON -#include "xss-common-includes.h" #include "avx2-32bit-qsort.hpp" /* diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 9736b065..61046cae 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -8,7 +8,6 @@ #ifndef AVX512_QSORT_64BIT_KV #define AVX512_QSORT_64BIT_KV -#include "xss-common-qsort.h" #include "avx512-64bit-common.h" #include "xss-network-keyvaluesort.hpp" @@ -419,7 +418,7 @@ avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize, bool hasnan = false) zmm_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { arrsize_t nan_count = 0; if (UNLIKELY(hasnan)) { nan_count = replace_nan_with_inf>(keys, arrsize); diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp index 4dcaeafa..1d15ef55 100644 --- a/src/avx512-64bit-qsort.hpp +++ b/src/avx512-64bit-qsort.hpp @@ -7,7 +7,6 @@ #ifndef AVX512_QSORT_64BIT #define AVX512_QSORT_64BIT -#include "xss-common-qsort.h" #include "avx512-64bit-common.h" #endif // AVX512_QSORT_64BIT diff --git
a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 130e28a8..6d2cca6b 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -200,65 +200,4 @@ X86_SIMD_SORT_INLINE_ONLY void replace_inf_with_nan(_Float16 *arr, } } } -/* Specialized template function for _Float16 qsort_*/ -template -X86_SIMD_SORT_INLINE_ONLY void -avx512_qsort(_Float16 *arr, arrsize_t arrsize, bool hasnan) -{ - using vtype = zmm_vector<_Float16>; - using comparator = - typename std::conditional, - Comparator>::type; - - if (arrsize > 1) { - arrsize_t nan_count = 0; - if (UNLIKELY(hasnan)) { - nan_count = replace_nan_with_inf(arr, arrsize); - } - - qsort_( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); - - replace_inf_with_nan(arr, arrsize, nan_count, descending); - } -} - -template -X86_SIMD_SORT_INLINE_ONLY void -avx512_qselect(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) -{ - using vtype = zmm_vector<_Float16>; - using comparator = - typename std::conditional, - Comparator>::type; - - arrsize_t index_first_elem = 0; - arrsize_t index_last_elem = arrsize - 1; - - if (UNLIKELY(hasnan)) { - if constexpr (descending) { - index_first_elem = move_nans_to_start_of_array(arr, arrsize); - } - else { - index_last_elem = move_nans_to_end_of_array(arr, arrsize); - } - } - - if (index_first_elem <= k && index_last_elem >= k) { - qselect_(arr, - k, - index_first_elem, - index_last_elem, - 2 * (arrsize_t)log2(arrsize)); - } -} -template -X86_SIMD_SORT_INLINE_ONLY void -avx512_partial_qsort(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) -{ - avx512_qselect(arr, k - 1, arrsize, hasnan); - avx512_qsort(arr, k - 1, hasnan); -} #endif // AVX512FP16_QSORT_16BIT diff --git a/src/x86simdsort-static-incl.h b/src/x86simdsort-static-incl.h new file mode 100644 index 00000000..0d0e4400 --- /dev/null +++ b/src/x86simdsort-static-incl.h @@ -0,0 +1,151 @@ +#ifndef X86_SIMD_SORT_STATIC_METHODS +#define X86_SIMD_SORT_STATIC_METHODS +#include 
+#include +#include "xss-common-includes.h" + +// Supported methods declared here for a quick reference: +namespace x86simdsortStatic { +template +X86_SIMD_SORT_FINLINE void +qsort(T *arr, size_t size, bool hasnan = false, bool descending = false); + +template +X86_SIMD_SORT_FINLINE void qselect(T *arr, + size_t k, + size_t size, + bool hasnan = false, + bool descending = false); + +template +X86_SIMD_SORT_FINLINE void partial_qsort(T *arr, + size_t k, + size_t size, + bool hasnan = false, + bool descending = false); + +template +X86_SIMD_SORT_FINLINE std::vector +argsort(T *arr, size_t size, bool hasnan = false, bool descending = false); + +/* argsort API required by NumPy: */ +template +X86_SIMD_SORT_FINLINE void argsort(T *arr, + size_t *arg, + size_t size, + bool hasnan = false, + bool descending = false); + +template +X86_SIMD_SORT_FINLINE std::vector +argselect(T *arr, size_t k, size_t size, bool hasnan = false); + +/* argselect API required by NumPy: */ +template +void X86_SIMD_SORT_FINLINE +argselect(T *arr, size_t *arg, size_t k, size_t size, bool hasnan = false); + +template +X86_SIMD_SORT_FINLINE void +keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan = false); + +} // namespace x86simdsortStatic + +#define XSS_METHODS(ISA) \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::qsort( \ + T *arr, size_t size, bool hasnan, bool descending) \ + { \ + ISA##_qsort(arr, size, hasnan, descending); \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::qselect( \ + T *arr, size_t k, size_t size, bool hasnan, bool descending) \ + { \ + ISA##_qselect(arr, k, size, hasnan, descending); \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::partial_qsort( \ + T *arr, size_t k, size_t size, bool hasnan, bool descending) \ + { \ + ISA##_partial_qsort(arr, k, size, hasnan, descending); \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::argsort( \ + T *arr, size_t *arg, size_t size, bool hasnan, bool 
descending) \ + { \ + ISA##_argsort(arr, arg, size, hasnan, descending); \ + } \ + template \ + X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argsort( \ + T *arr, size_t size, bool hasnan, bool descending) \ + { \ + std::vector indices(size); \ + std::iota(indices.begin(), indices.end(), 0); \ + x86simdsortStatic::argsort( \ + arr, indices.data(), size, hasnan, descending); \ + return indices; \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::argselect( \ + T *arr, size_t *arg, size_t k, size_t size, bool hasnan) \ + { \ + ISA##_argselect(arr, arg, k, size, hasnan); \ + } \ + template \ + X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argselect( \ + T *arr, size_t k, size_t size, bool hasnan) \ + { \ + std::vector indices(size); \ + std::iota(indices.begin(), indices.end(), 0); \ + x86simdsortStatic::argselect(arr, indices.data(), k, size, hasnan); \ + return indices; \ + } + +/* + * qsort, qselect, partial, argsort key-value sort template functions. + */ +#include "xss-common-qsort.h" +#include "xss-common-argsort.h" + +#if defined(__AVX512DQ__) && defined(__AVX512VL__) +/* 32-bit and 64-bit dtypes vector definitions on SKX */ +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" +#include "avx512-64bit-argsort.hpp" +#include "avx512-64bit-keyvaluesort.hpp" + +/* 16-bit dtypes vector definitions on ICL */ +#if defined(__AVX512BW__) && defined(__AVX512VBMI2__) +#include "avx512-16bit-qsort.hpp" +/* _Float16 vector definition on SPR*/ +#if defined(__FLT16_MAX__) && defined(__AVX512BW__) && defined(__AVX512FP16__) +#include "avx512fp16-16bit-qsort.hpp" +#endif // __FLT16_MAX__ +#endif // __AVX512VBMI2__ + +XSS_METHODS(avx512) + +// key-value currently only on avx512 +template +X86_SIMD_SORT_FINLINE void +x86simdsortStatic::keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan) +{ + avx512_qsort_kv(key, val, size, hasnan); +} + +#elif defined(__AVX512F__) +#error "x86simdsort requires AVX512DQ and AVX512VL to be 
enabled in addition to AVX512F to use AVX512" + +#elif defined(__AVX2__) && !defined(__AVX512F__) +/* 32-bit and 64-bit dtypes vector definitions on AVX2 */ +#include "avx2-32bit-half.hpp" +#include "avx2-32bit-qsort.hpp" +#include "avx2-64bit-qsort.hpp" +XSS_METHODS(avx2) + +#else +#error "x86simdsortStatic methods needs to be compiled with avx512/avx2 specific flags" +#endif // (__AVX512VL__ && __AVX512DQ__) || AVX2 + +#endif // X86_SIMD_SORT_STATIC_METHODS diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index b97dd0d0..4fa5041a 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -7,7 +7,6 @@ #ifndef XSS_COMMON_ARGSORT #define XSS_COMMON_ARGSORT -#include "xss-common-qsort.h" #include "xss-network-keyvaluesort.hpp" #include @@ -558,7 +557,7 @@ X86_SIMD_SORT_INLINE void avx512_argsort(T *arr, zmm_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argsort_withnan(arr, arg, 0, arrsize); @@ -575,16 +574,6 @@ X86_SIMD_SORT_INLINE void avx512_argsort(T *arr, } } -template -X86_SIMD_SORT_INLINE std::vector avx512_argsort( - T *arr, arrsize_t arrsize, bool hasnan = false, bool descending = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx512_argsort(arr, indices.data(), arrsize, hasnan, descending); - return indices; -} - /* argsort methods for 32-bit and 64-bit dtypes */ template X86_SIMD_SORT_INLINE void avx2_argsort(T *arr, @@ -602,7 +591,7 @@ X86_SIMD_SORT_INLINE void avx2_argsort(T *arr, avx2_half_vector, avx2_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argsort_withnan(arr, arg, 0, arrsize); @@ -619,16 +608,6 @@ X86_SIMD_SORT_INLINE void avx2_argsort(T *arr, } } -template -X86_SIMD_SORT_INLINE std::vector avx2_argsort( - T 
*arr, arrsize_t arrsize, bool hasnan = false, bool descending = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx2_argsort(arr, indices.data(), arrsize, hasnan, descending); - return indices; -} - /* argselect methods for 32-bit and 64-bit dtypes */ template X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, @@ -648,7 +627,7 @@ X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, zmm_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argselect_withnan(arr, arg, k, 0, arrsize); return; @@ -660,16 +639,6 @@ X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, } } -template -X86_SIMD_SORT_INLINE std::vector -avx512_argselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx512_argselect(arr, indices.data(), k, arrsize, hasnan); - return indices; -} - /* argselect methods for 32-bit and 64-bit dtypes */ template X86_SIMD_SORT_INLINE void avx2_argselect(T *arr, @@ -688,7 +657,7 @@ X86_SIMD_SORT_INLINE void avx2_argselect(T *arr, avx2_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argselect_withnan(arr, arg, k, 0, arrsize); return; @@ -699,15 +668,4 @@ X86_SIMD_SORT_INLINE void avx2_argselect(T *arr, arr, arg, k, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } - -template -X86_SIMD_SORT_INLINE std::vector -avx2_argselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx2_argselect(arr, indices.data(), k, arrsize, hasnan); - return indices; -} - #endif // XSS_COMMON_ARGSORT diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 2682919e..83a54716 100644 --- 
a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -7,6 +7,7 @@ #include #include #include +#include "xss-custom-float.h" #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 02522b50..2d5b4ea1 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -34,7 +34,6 @@ * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 * */ -#include "xss-common-includes.h" #include "xss-pivot-selection.hpp" #include "xss-network-qsort.hpp" #include "xss-common-comparators.hpp" @@ -106,8 +105,8 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(type_t *arr, { if (descending) { for (arrsize_t ii = 0; nan_count > 0; ++ii) { - if constexpr (std::is_floating_point_v) { - arr[ii] = std::numeric_limits::quiet_NaN(); + if constexpr (xss::fp::is_floating_point_v) { + arr[ii] = xss::fp::quiet_NaN(); } else { arr[ii] = 0xFFFF; @@ -117,8 +116,8 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(type_t *arr, } else { for (arrsize_t ii = size - 1; nan_count > 0; --ii) { - if constexpr (std::is_floating_point_v) { - arr[ii] = std::numeric_limits::quiet_NaN(); + if constexpr (xss::fp::is_floating_point_v) { + arr[ii] = xss::fp::quiet_NaN(); } else { arr[ii] = 0xFFFF; @@ -620,7 +619,7 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) if (arrsize > 1) { arrsize_t nan_count = 0; - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if (UNLIKELY(hasnan)) { nan_count = replace_nan_with_inf(arr, arrsize); } @@ -647,7 +646,7 @@ xss_qselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) arrsize_t index_first_elem = 0; arrsize_t index_last_elem = arrsize - 1; - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if (UNLIKELY(hasnan)) { if constexpr (descending) { 
index_first_elem = move_nans_to_start_of_array(arr, arrsize); diff --git a/utils/custom-float.h b/src/xss-custom-float.h similarity index 96% rename from utils/custom-float.h rename to src/xss-custom-float.h index 5faaa9e8..5fd973a7 100644 --- a/utils/custom-float.h +++ b/src/xss-custom-float.h @@ -1,5 +1,5 @@ -#ifndef UTILS_FLOAT -#define UTILS_FLOAT +#ifndef XSS_CUSTOM_FLOAT +#define XSS_CUSTOM_FLOAT #include namespace xss { namespace fp { @@ -87,4 +87,4 @@ namespace fp { } // namespace fp } // namespace xss -#endif +#endif // XSS_CUSTOM_FLOAT diff --git a/src/xss-network-keyvaluesort.hpp b/src/xss-network-keyvaluesort.hpp index a20da171..bb9b9bcd 100644 --- a/src/xss-network-keyvaluesort.hpp +++ b/src/xss-network-keyvaluesort.hpp @@ -1,8 +1,6 @@ #ifndef XSS_KEYVALUE_NETWORKS #define XSS_KEYVALUE_NETWORKS -#include "xss-common-includes.h" - template typename valueType::opmask_t resize_mask(typename keyType::opmask_t mask) { diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp index dd299507..0c1d1d8a 100644 --- a/src/xss-network-qsort.hpp +++ b/src/xss-network-qsort.hpp @@ -2,7 +2,6 @@ #define XSS_NETWORK_QSORT #include "xss-optimal-networks.hpp" -#include "xss-common-qsort.h" template X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); diff --git a/tests/meson.build b/tests/meson.build index 86ca2fe8..0583c55e 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -3,17 +3,17 @@ libtests = [] libtests += static_library('tests_qsort', files('test-qsort.cpp', ), dependencies: gtest_dep, - include_directories : [lib, utils], + include_directories : [src, lib, utils], ) libtests += static_library('tests_kvsort', files('test-keyvalue.cpp', ), dependencies: gtest_dep, - include_directories : [lib, utils], + include_directories : [src, lib, utils], ) libtests += static_library('tests_objsort', files('test-objqsort.cpp', ), dependencies: gtest_dep, - include_directories : [lib, utils], + include_directories : [src, lib, utils], ) diff --git 
a/utils/custom-compare.h b/utils/custom-compare.h index ab8df85c..6244bb24 100644 --- a/utils/custom-compare.h +++ b/utils/custom-compare.h @@ -1,6 +1,6 @@ #include #include -#include "custom-float.h" +#include "xss-custom-float.h" /* * Custom comparator class to handle NAN's: treats NAN > INF diff --git a/utils/rand_array.h b/utils/rand_array.h index a9703551..cb99da2e 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -10,7 +10,7 @@ #include #include #include -#include "custom-float.h" +#include "xss-custom-float.h" template static std::vector get_uniform_rand_array(int64_t arrsize,