|
11 | 11 | #include "xss-common-qsort.h"
|
12 | 12 | #include "xss-network-keyvaluesort.hpp"
|
13 | 13 |
|
| 14 | +#if defined(XSS_USE_OPENMP) && defined(_OPENMP) |
| 15 | +#define XSS_COMPILE_OPENMP |
| 16 | +#include <omp.h> |
| 17 | +#endif |
| 18 | + |
14 | 19 | /*
|
15 | 20 | * Partition one ZMM register based on the pivot and return the index of the
|
16 | 21 | * last element that is less than or equal to the pivot.
|
@@ -393,7 +398,7 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
|
393 | 398 | arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4>(
|
394 | 399 | keys, indexes, left, right + 1, pivot, &smallest, &biggest);
|
395 | 400 |
|
396 |
| -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) |
| 401 | +#ifdef XSS_COMPILE_OPENMP |
397 | 402 | if (pivot != smallest) {
|
398 | 403 | bool parallel_left = (pivot_index - left) > task_threshold;
|
399 | 404 | if (parallel_left) {
|
@@ -534,18 +539,28 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv(
|
534 | 539 | UNUSED(hasnan);
|
535 | 540 | }
|
536 | 541 |
|
537 |
| -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) |
| 542 | +#ifdef XSS_COMPILE_OPENMP |
| 543 | + |
538 | 544 | bool use_parallel = arrsize > 10000;
|
539 |
| - arrsize_t task_threshold = std::max((arrsize_t)10000, arrsize / 100); |
| 545 | + |
540 | 546 | if (use_parallel) {
|
541 |
| -#pragma omp parallel |
| 547 | + // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system |
| 548 | + constexpr int thread_limit = 8; |
| 549 | + int thread_count = std::min(thread_limit, omp_get_max_threads()); |
| 550 | + arrsize_t task_threshold |
| 551 | + = std::max((arrsize_t)10000, arrsize / 100); |
| 552 | + |
| 553 | + // We use omp parallel and then omp single to set up the threads that will run the omp task calls in kvsort_ |
| 554 | + // The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems |
| 555 | + // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays |
| 556 | +#pragma omp parallel num_threads(thread_count) |
542 | 557 | #pragma omp single
|
543 | 558 | kvsort_<keytype, valtype>(
|
544 | 559 | keys, indexes, 0, arrsize - 1, maxiters, task_threshold);
|
545 | 560 | }
|
546 | 561 | else {
|
547 | 562 | kvsort_<keytype, valtype>(
|
548 |
| - keys, indexes, 0, arrsize - 1, maxiters, task_threshold); |
| 563 | + keys, indexes, 0, arrsize - 1, maxiters, 0); |
549 | 564 | }
|
550 | 565 | #else
|
551 | 566 | kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters, 0);
|
|
0 commit comments