Skip to content

Commit d3acd51

Browse files
author
Raghuveer Devulapalli
authored
Merge pull request #153 from sterrettm2/kvsort-openmp
Adds OpenMP based parallelization to key-value sorting
2 parents 2315766 + 5224f60 commit d3acd51

File tree

7 files changed

+134
-10
lines changed

7 files changed

+134
-10
lines changed

.github/workflows/c-cpp.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,38 @@ jobs:
135135
- name: Run test suite on SPR
136136
run: sde -spr -- ./builddir/testexe
137137

138+
SKX-SKL-openmp:
139+
140+
runs-on: intel-ubuntu-latest
141+
142+
steps:
143+
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
144+
145+
- name: Install dependencies
146+
run: |
147+
sudo apt update
148+
sudo apt -y install g++-10 libgtest-dev meson curl git
149+
150+
- name: Install Intel SDE
151+
run: |
152+
curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz
153+
mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
154+
sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
155+
156+
- name: Build
157+
env:
158+
CXX: g++-10
159+
run: |
160+
make clean
161+
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
162+
cd builddir
163+
ninja
164+
165+
- name: Run test suite on SKX and SKL
166+
run: |
167+
sde -skx -- ./builddir/testexe
168+
sde -skl -- ./builddir/testexe
169+
138170
SPR-gcc13-special-cases:
139171

140172
runs-on: intel-ubuntu-latest

Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
test:
2-
meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
2+
meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir
3+
cd builddir && ninja
4+
5+
test_openmp:
6+
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
37
cd builddir && ninja
48

59
bench:

lib/meson.build

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
libtargets = []
22

3+
# Add compile flags for OpenMP if enabled
4+
openmpflags = []
5+
if get_option('use_openmp')
6+
openmpflags = ['-DXSS_USE_OPENMP=true', '-fopenmp']
7+
endif
8+
39
if cpp.has_argument('-march=haswell')
410
libtargets += static_library('libavx',
511
files(
612
'x86simdsort-avx2.cpp',
713
),
814
include_directories : [src],
9-
cpp_args : ['-march=haswell'],
15+
cpp_args : ['-march=haswell', openmpflags],
1016
gnu_symbol_visibility : 'inlineshidden',
1117
)
1218
endif
@@ -17,7 +23,7 @@ if cpp.has_argument('-march=skylake-avx512')
1723
'x86simdsort-skx.cpp',
1824
),
1925
include_directories : [src],
20-
cpp_args : ['-march=skylake-avx512'],
26+
cpp_args : ['-march=skylake-avx512', openmpflags],
2127
gnu_symbol_visibility : 'inlineshidden',
2228
)
2329
endif

meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ subdir('lib')
3737
libsimdsort = shared_library('x86simdsortcpp',
3838
'lib/x86simdsort.cpp',
3939
include_directories : [src, utils, lib],
40+
link_args : [openmpflags],
4041
link_with : [libtargets],
4142
gnu_symbol_visibility : 'inlineshidden',
4243
install : true,

meson_options.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,7 @@ option('build_benchmarks', type : 'boolean', value : false,
44
description : 'Build benchmarking suite (default: "false").')
55
option('build_ippbench', type : 'boolean', value : false,
66
description : 'Add IPP sort to benchmarks (default: "false").')
7-
option('build_vqsortbench', type : 'boolean', value : false,
8-
description : 'Add google vqsort to benchmarks (default: "false").')
7+
option('build_vqsortbench', type : 'boolean', value : true,
8+
description : 'Add google vqsort to benchmarks (default: "true").')
9+
option('use_openmp', type : 'boolean', value : false,
10+
description : 'Use OpenMP to accelerate key-value sort (default: "false").')

scripts/branch-compare.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ build_branch() {
2727
fi
2828
fi
2929
cd $dir_name
30-
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir
30+
meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir
3131
cd builddir
3232
ninja
3333
cd ../../

src/xss-common-keyvaluesort.hpp

Lines changed: 83 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
#include "xss-common-qsort.h"
1212
#include "xss-network-keyvaluesort.hpp"
1313

14+
#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
15+
#define XSS_COMPILE_OPENMP
16+
#include <omp.h>
17+
#endif
18+
1419
/*
1520
* Partitions one ZMM register based on the pivot and returns the index of the
1621
* last element that is less than equal to the pivot.
@@ -366,7 +371,8 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
366371
type2_t *indexes,
367372
arrsize_t left,
368373
arrsize_t right,
369-
int max_iters)
374+
int max_iters,
375+
arrsize_t task_threshold)
370376
{
371377
/*
372378
* Resort to std::sort if quicksort isn't making any progress
@@ -391,14 +397,61 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
391397
type1_t biggest = vtype1::type_min();
392398
arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4>(
393399
keys, indexes, left, right + 1, pivot, &smallest, &biggest);
400+
401+
#ifdef XSS_COMPILE_OPENMP
402+
if (pivot != smallest) {
403+
bool parallel_left = (pivot_index - left) > task_threshold;
404+
if (parallel_left) {
405+
#pragma omp task
406+
kvsort_<vtype1, vtype2>(keys,
407+
indexes,
408+
left,
409+
pivot_index - 1,
410+
max_iters - 1,
411+
task_threshold);
412+
}
413+
else {
414+
kvsort_<vtype1, vtype2>(keys,
415+
indexes,
416+
left,
417+
pivot_index - 1,
418+
max_iters - 1,
419+
task_threshold);
420+
}
421+
}
422+
if (pivot != biggest) {
423+
bool parallel_right = (right - pivot_index) > task_threshold;
424+
425+
if (parallel_right) {
426+
#pragma omp task
427+
kvsort_<vtype1, vtype2>(keys,
428+
indexes,
429+
pivot_index,
430+
right,
431+
max_iters - 1,
432+
task_threshold);
433+
}
434+
else {
435+
kvsort_<vtype1, vtype2>(keys,
436+
indexes,
437+
pivot_index,
438+
right,
439+
max_iters - 1,
440+
task_threshold);
441+
}
442+
}
443+
#else
444+
UNUSED(task_threshold);
445+
394446
if (pivot != smallest) {
395447
kvsort_<vtype1, vtype2>(
396-
keys, indexes, left, pivot_index - 1, max_iters - 1);
448+
keys, indexes, left, pivot_index - 1, max_iters - 1, 0);
397449
}
398450
if (pivot != biggest) {
399451
kvsort_<vtype1, vtype2>(
400-
keys, indexes, pivot_index, right, max_iters - 1);
452+
keys, indexes, pivot_index, right, max_iters - 1, 0);
401453
}
454+
#endif
402455
}
403456

404457
template <typename vtype1,
@@ -486,7 +539,33 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv(
486539
UNUSED(hasnan);
487540
}
488541

489-
kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters);
542+
#ifdef XSS_COMPILE_OPENMP
543+
544+
bool use_parallel = arrsize > 10000;
545+
546+
if (use_parallel) {
547+
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
548+
constexpr int thread_limit = 8;
549+
int thread_count = std::min(thread_limit, omp_get_max_threads());
550+
arrsize_t task_threshold
551+
= std::max((arrsize_t)10000, arrsize / 100);
552+
553+
// We use omp parallel and then omp single to set up the threads that will run the omp task calls in kvsort_
554+
// The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems
555+
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
556+
#pragma omp parallel num_threads(thread_count)
557+
#pragma omp single
558+
kvsort_<keytype, valtype>(
559+
keys, indexes, 0, arrsize - 1, maxiters, task_threshold);
560+
}
561+
else {
562+
kvsort_<keytype, valtype>(
563+
keys, indexes, 0, arrsize - 1, maxiters, 0);
564+
}
565+
#else
566+
kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters, 0);
567+
#endif
568+
490569
replace_inf_with_nan(keys, arrsize, nan_count);
491570

492571
if (descending) {

0 commit comments

Comments
 (0)