11
11
#include " xss-common-qsort.h"
12
12
#include " xss-network-keyvaluesort.hpp"
13
13
14
+ #if defined(XSS_USE_OPENMP) && defined(_OPENMP)
15
+ #define XSS_COMPILE_OPENMP
16
+ #include < omp.h>
17
+ #endif
18
+
14
19
/*
15
20
* Partition one ZMM register based on the pivot and return the index of the
16
21
* last element that is less than or equal to the pivot.
@@ -366,7 +371,8 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
366
371
type2_t *indexes,
367
372
arrsize_t left,
368
373
arrsize_t right,
369
- int max_iters)
374
+ int max_iters,
375
+ arrsize_t task_threshold)
370
376
{
371
377
/*
372
378
* Resort to std::sort if quicksort isn't making any progress
@@ -391,14 +397,61 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
391
397
type1_t biggest = vtype1::type_min ();
392
398
arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4 >(
393
399
keys, indexes, left, right + 1 , pivot, &smallest, &biggest);
400
+
401
+ #ifdef XSS_COMPILE_OPENMP
402
+ if (pivot != smallest) {
403
+ bool parallel_left = (pivot_index - left) > task_threshold;
404
+ if (parallel_left) {
405
+ #pragma omp task
406
+ kvsort_<vtype1, vtype2>(keys,
407
+ indexes,
408
+ left,
409
+ pivot_index - 1 ,
410
+ max_iters - 1 ,
411
+ task_threshold);
412
+ }
413
+ else {
414
+ kvsort_<vtype1, vtype2>(keys,
415
+ indexes,
416
+ left,
417
+ pivot_index - 1 ,
418
+ max_iters - 1 ,
419
+ task_threshold);
420
+ }
421
+ }
422
+ if (pivot != biggest) {
423
+ bool parallel_right = (right - pivot_index) > task_threshold;
424
+
425
+ if (parallel_right) {
426
+ #pragma omp task
427
+ kvsort_<vtype1, vtype2>(keys,
428
+ indexes,
429
+ pivot_index,
430
+ right,
431
+ max_iters - 1 ,
432
+ task_threshold);
433
+ }
434
+ else {
435
+ kvsort_<vtype1, vtype2>(keys,
436
+ indexes,
437
+ pivot_index,
438
+ right,
439
+ max_iters - 1 ,
440
+ task_threshold);
441
+ }
442
+ }
443
+ #else
444
+ UNUSED (task_threshold);
445
+
394
446
if (pivot != smallest) {
395
447
kvsort_<vtype1, vtype2>(
396
- keys, indexes, left, pivot_index - 1 , max_iters - 1 );
448
+ keys, indexes, left, pivot_index - 1 , max_iters - 1 , 0 );
397
449
}
398
450
if (pivot != biggest) {
399
451
kvsort_<vtype1, vtype2>(
400
- keys, indexes, pivot_index, right, max_iters - 1 );
452
+ keys, indexes, pivot_index, right, max_iters - 1 , 0 );
401
453
}
454
+ #endif
402
455
}
403
456
404
457
template <typename vtype1,
@@ -486,7 +539,33 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv(
486
539
UNUSED (hasnan);
487
540
}
488
541
489
- kvsort_<keytype, valtype>(keys, indexes, 0 , arrsize - 1 , maxiters);
542
+ #ifdef XSS_COMPILE_OPENMP
543
+
544
+ bool use_parallel = arrsize > 10000 ;
545
+
546
+ if (use_parallel) {
547
+ // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
548
+ constexpr int thread_limit = 8 ;
549
+ int thread_count = std::min (thread_limit, omp_get_max_threads ());
550
+ arrsize_t task_threshold
551
+ = std::max ((arrsize_t )10000 , arrsize / 100 );
552
+
553
+ // We use omp parallel and then omp single to setup the threads that will run the omp task calls in kvsort_
554
+ // The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems
555
+ // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
556
+ #pragma omp parallel num_threads(thread_count)
557
+ #pragma omp single
558
+ kvsort_<keytype, valtype>(
559
+ keys, indexes, 0 , arrsize - 1 , maxiters, task_threshold);
560
+ }
561
+ else {
562
+ kvsort_<keytype, valtype>(
563
+ keys, indexes, 0 , arrsize - 1 , maxiters, 0 );
564
+ }
565
+ #else
566
+ kvsort_<keytype, valtype>(keys, indexes, 0 , arrsize - 1 , maxiters, 0 );
567
+ #endif
568
+
490
569
replace_inf_with_nan (keys, arrsize, nan_count);
491
570
492
571
if (descending) {
0 commit comments