numpy · r-devulap · Dec 13, 2023 · Dec 12, 2023 · Dec 12, 2023 · Dec 12, 2023
diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml
@@ -89,6 +89,12 @@ jobs:
         sudo apt update
         sudo apt -y install g++-12 gcc-12 git
 
+    - name: Install Intel SDE  # installs the `sde` launcher invoked by the test steps of this job
+      run: |
+        curl -fsSL --retry 3 -o /tmp/sde.tar.xz https://downloadmirror.intel.com/788820/sde-external-9.27.0-2023-09-13-lin.tar.xz
+        mkdir /tmp/sde && tar -xf /tmp/sde.tar.xz -C /tmp/sde/
+        sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
+
     - name: Checkout NumPy main
       uses: actions/checkout@v3
       with:
@@ -123,3 +129,21 @@ jobs:
         CC: gcc-12
       run: |
         spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr
+
+    - name: Run tests on TGL  # executes python under `sde -tgl`
+      working-directory: ${{ github.workspace }}/numpy
+      run: |
+        NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
+        export NUMPY_SITE PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$NUMPY_SITE"
+        cd build-install &&
+        sde -tgl -- python -c "import numpy; numpy.show_config()" &&
+        sde -tgl -- python -m pytest "$NUMPY_SITE/numpy/_core/tests/test_multiarray.py"
+
+    - name: Run tests on SPR  # executes python under `sde -spr`
+      working-directory: ${{ github.workspace }}/numpy
+      run: |
+        NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
+        export NUMPY_SITE PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$NUMPY_SITE"
+        cd build-install &&
+        sde -spr -- python -c "import numpy; numpy.show_config()" &&
+        sde -spr -- python -m pytest "$NUMPY_SITE/numpy/_core/tests/test_multiarray.py"
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
@@ -499,16 +499,17 @@ replace_nan_with_inf<zmm_vector<float16>>(uint16_t *arr, arrsize_t arrsize)
 {
     arrsize_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
-    while (arrsize > 0) {
-        if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; }
+    for (arrsize_t ii = 0; ii < arrsize; ii = ii + zmm_vector<float16>::numlanes / 2) {
+        if (arrsize - ii < 16) { // fewer than 16 elements remain past index ii
+            loadmask = (0x0001 << (arrsize-ii)) - 0x0001; // set only the low (arrsize-ii) mask bits
+        }
         __m256i in_zmm = _mm256_maskz_loadu_epi16(loadmask, arr);
         __m512 in_zmm_asfloat = _mm512_cvtph_ps(in_zmm);
         __mmask16 nanmask = _mm512_cmp_ps_mask(
                 in_zmm_asfloat, in_zmm_asfloat, _CMP_NEQ_UQ);
         nan_count += _mm_popcnt_u32((int32_t)nanmask);
         _mm256_mask_storeu_epi16(arr, nanmask, YMM_MAX_HALF);
         arr += 16;
-        arrsize -= 16;
     }
     return nan_count;
 }