@@ -384,21 +384,25 @@ X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr,
384
384
const int64_t left,
385
385
const int64_t right)
386
386
{
387
- // median of 8
387
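+ // median of 8x8 elements: 8 vectors are loaded at stride `size` starting at arr + left and passed through the COEX network below; v[3] is then sorted and its element at index 4 is returned as the pivot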
388
388
int64_t size = (right - left) / 8 ;
389
389
using zmm_t = typename vtype::zmm_t ;
390
- __m512i rand_index = _mm512_set_epi64 (left + size,
391
- left + 2 * size,
392
- left + 3 * size,
393
- left + 4 * size,
394
- left + 5 * size,
395
- left + 6 * size,
396
- left + 7 * size,
397
- left + 8 * size);
398
- zmm_t rand_vec = vtype::template i64gather<sizeof (type_t )>(rand_index, arr);
390
+ zmm_t v[8 ];
391
+ for (int64_t ii = 0 ; ii < 8 ; ++ii) {
392
+ v[ii] = vtype::loadu (arr + left + ii*size);
393
+ }
394
+ COEX<vtype>(v[0 ], v[1 ]); COEX<vtype>(v[2 ], v[3 ]); /* step 1 */
395
+ COEX<vtype>(v[4 ], v[5 ]); COEX<vtype>(v[6 ], v[7 ]);
396
+ COEX<vtype>(v[0 ], v[2 ]); COEX<vtype>(v[1 ], v[3 ]); /* step 2 */
397
+ COEX<vtype>(v[4 ], v[6 ]); COEX<vtype>(v[5 ], v[7 ]);
398
+ COEX<vtype>(v[0 ], v[4 ]); COEX<vtype>(v[1 ], v[2 ]); /* step 3 */
399
+ COEX<vtype>(v[5 ], v[6 ]); COEX<vtype>(v[3 ], v[7 ]);
400
+ COEX<vtype>(v[1 ], v[5 ]); COEX<vtype>(v[2 ], v[6 ]); /* step 4 */
401
+ COEX<vtype>(v[3 ], v[5 ]); COEX<vtype>(v[2 ], v[4 ]); /* step 5 */
402
+ COEX<vtype>(v[3 ], v[4 ]); /* step 6 */
399
403
// pivot will never be a nan, since there are no nan's!
400
- zmm_t sort = sort_zmm_64bit<vtype>(rand_vec );
404
+ zmm_t sort = sort_zmm_64bit<vtype>(v[ 3 ] );
401
405
return ((type_t *)&sort)[4 ];
402
406
}
403
407
404
- #endif
408
+ #endif
0 commit comments