diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml index 4ffa985c..4cff8ed8 100644 --- a/.github/workflows/build-numpy.yml +++ b/.github/workflows/build-numpy.yml @@ -69,4 +69,57 @@ jobs: python -c "import numpy; numpy.show_config()" && python -m pytest $NUMPY_SITE/numpy/_core/tests/test_multiarray.py + NumPy-SPR-baseline: + runs-on: intel-ubuntu-latest + + steps: + - name: Checkout x86-simd-sort + uses: actions/checkout@v3 + with: + fetch-depth: 0 + path: x86-simd-sort + + - name: Specify branch name + working-directory: ${{ github.workspace }}/x86-simd-sort + run: git switch -c pr-branch + + - name: Install build dependencies + run: | + sudo apt update + sudo apt -y install g++-12 gcc-12 git + + - name: Checkout NumPy main + uses: actions/checkout@v3 + with: + repository: numpy/numpy + submodules: recursive + fetch-depth: 0 + ref: main + path: numpy + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install NumPy dependencies + working-directory: ${{ github.workspace }}/numpy + run: | + pip install -r build_requirements.txt + pip install -r test_requirements.txt + + - name: Update x86-simd-sort + working-directory: ${{ github.workspace }}/numpy + run: | + cd numpy/_core/src/npysort/x86-simd-sort + git remote add temp ${{ github.workspace }}/x86-simd-sort + git fetch temp + git checkout temp/pr-branch + + - name: Build NumPy with cpu baseline SPR + working-directory: ${{ github.workspace }}/numpy + env: + CXX: g++-12 + CC: gcc-12 + run: | + spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 2821011e..a71281f4 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -465,7 +465,8 @@ struct zmm_vector { }; template <> -bool comparison_func>(const uint16_t &a, const uint16_t &b) +X86_SIMD_SORT_INLINE_ONLY bool +comparison_func>(const uint16_t &a, const uint16_t &b) { uint16_t signa = a & 0x8000, signb = b
& 0x8000; uint16_t expa = a & 0x7c00, expb = b & 0x7c00; @@ -493,8 +494,8 @@ bool comparison_func>(const uint16_t &a, const uint16_t &b) } template <> -arrsize_t replace_nan_with_inf>(uint16_t *arr, - arrsize_t arrsize) +X86_SIMD_SORT_INLINE_ONLY arrsize_t +replace_nan_with_inf>(uint16_t *arr, arrsize_t arrsize) { arrsize_t nan_count = 0; __mmask16 loadmask = 0xFFFF; @@ -513,13 +514,13 @@ arrsize_t replace_nan_with_inf>(uint16_t *arr, } template <> -bool is_a_nan(uint16_t elem) +X86_SIMD_SORT_INLINE_ONLY bool is_a_nan(uint16_t elem) { return ((elem & 0x7c00u) == 0x7c00u) && ((elem & 0x03ffu) != 0); } -X86_SIMD_SORT_INLINE -void avx512_qsort_fp16(uint16_t *arr, arrsize_t arrsize, bool hasnan = false) +X86_SIMD_SORT_INLINE void +avx512_qsort_fp16(uint16_t *arr, arrsize_t arrsize, bool hasnan = false) { if (arrsize > 1) { arrsize_t nan_count = 0; @@ -533,11 +534,10 @@ void avx512_qsort_fp16(uint16_t *arr, arrsize_t arrsize, bool hasnan = false) } } -X86_SIMD_SORT_INLINE -void avx512_qselect_fp16(uint16_t *arr, - arrsize_t k, - arrsize_t arrsize, - bool hasnan = false) +X86_SIMD_SORT_INLINE void avx512_qselect_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false) { arrsize_t indx_last_elem = arrsize - 1; if (UNLIKELY(hasnan)) { @@ -549,11 +549,10 @@ void avx512_qselect_fp16(uint16_t *arr, } } -X86_SIMD_SORT_INLINE -void avx512_partial_qsort_fp16(uint16_t *arr, - arrsize_t k, - arrsize_t arrsize, - bool hasnan = false) +X86_SIMD_SORT_INLINE void avx512_partial_qsort_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false) { avx512_qselect_fp16(arr, k - 1, arrsize, hasnan); avx512_qsort_fp16(arr, k - 1); diff --git a/src/avx512-64bit-argsort.hpp b/src/avx512-64bit-argsort.hpp index c831b65d..c4084c68 100644 --- a/src/avx512-64bit-argsort.hpp +++ b/src/avx512-64bit-argsort.hpp @@ -657,9 +657,8 @@ avx512_argsort(T *arr, arrsize_t *arg, arrsize_t arrsize, bool hasnan = false) } template -X86_SIMD_SORT_INLINE std::vector 
avx512_argsort(T *arr, - arrsize_t arrsize, - bool hasnan = false) +X86_SIMD_SORT_INLINE std::vector +avx512_argsort(T *arr, arrsize_t arrsize, bool hasnan = false) { std::vector indices(arrsize); std::iota(indices.begin(), indices.end(), 0); @@ -669,8 +668,11 @@ X86_SIMD_SORT_INLINE std::vector avx512_argsort(T *arr, /* argselect methods for 32-bit and 64-bit dtypes */ template -X86_SIMD_SORT_INLINE void -avx512_argselect(T *arr, arrsize_t *arg, arrsize_t k, arrsize_t arrsize, bool hasnan = false) +X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, + arrsize_t *arg, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false) { using vectype = typename std::conditional, diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 60004fc6..21958027 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -160,13 +160,14 @@ struct zmm_vector<_Float16> { }; template <> -bool is_a_nan<_Float16>(_Float16 elem) +X86_SIMD_SORT_INLINE_ONLY bool is_a_nan<_Float16>(_Float16 elem) { return elem != elem; } template <> -void replace_inf_with_nan(_Float16 *arr, arrsize_t size, arrsize_t nan_count) +X86_SIMD_SORT_INLINE_ONLY void +replace_inf_with_nan(_Float16 *arr, arrsize_t size, arrsize_t nan_count) { Fp16Bits val; val.i_ = 0x7c01; @@ -177,7 +178,8 @@ void replace_inf_with_nan(_Float16 *arr, arrsize_t size, arrsize_t nan_count) } /* Specialized template function for _Float16 qsort_*/ template <> -void avx512_qsort(_Float16 *arr, arrsize_t arrsize, bool hasnan) +X86_SIMD_SORT_INLINE_ONLY void +avx512_qsort(_Float16 *arr, arrsize_t arrsize, bool hasnan) { if (arrsize > 1) { arrsize_t nan_count = 0; @@ -192,7 +194,8 @@ void avx512_qsort(_Float16 *arr, arrsize_t arrsize, bool hasnan) } template <> -void avx512_qselect(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) +X86_SIMD_SORT_INLINE_ONLY void +avx512_qselect(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) { arrsize_t indx_last_elem = arrsize - 1; if 
(UNLIKELY(hasnan)) { @@ -204,10 +207,8 @@ void avx512_qselect(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) } } template <> -void avx512_partial_qsort(_Float16 *arr, - arrsize_t k, - arrsize_t arrsize, - bool hasnan) +X86_SIMD_SORT_INLINE_ONLY void +avx512_partial_qsort(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) { avx512_qselect(arr, k - 1, arrsize, hasnan); avx512_qsort(arr, k - 1, hasnan); diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 23c9f964..c373ba54 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -38,6 +38,7 @@ /* Compiler specific macros specific */ #ifdef _MSC_VER +#define X86_SIMD_SORT_INLINE_ONLY inline #define X86_SIMD_SORT_INLINE static inline #define X86_SIMD_SORT_FINLINE static __forceinline #define LIKELY(x) (x) @@ -47,14 +48,17 @@ * Force inline in cygwin to work around a compiler bug. See * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584 */ +#define X86_SIMD_SORT_INLINE_ONLY inline #define X86_SIMD_SORT_INLINE static __attribute__((always_inline)) #define X86_SIMD_SORT_FINLINE static __attribute__((always_inline)) #elif defined(__GNUC__) +#define X86_SIMD_SORT_INLINE_ONLY inline #define X86_SIMD_SORT_INLINE static inline #define X86_SIMD_SORT_FINLINE static inline __attribute__((always_inline)) #define LIKELY(x) __builtin_expect((x), 1) #define UNLIKELY(x) __builtin_expect((x), 0) #else +#define X86_SIMD_SORT_INLINE_ONLY #define X86_SIMD_SORT_INLINE static #define X86_SIMD_SORT_FINLINE static #define LIKELY(x) (x)