Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,38 @@ jobs:
- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe

SKX-SKL-openmp:

runs-on: intel-ubuntu-latest

steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

- name: Install dependencies
run: |
sudo apt update
sudo apt -y install g++-10 libgtest-dev meson curl git

- name: Install Intel SDE
run: |
curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz
mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde

- name: Build
env:
CXX: g++-10
run: |
make clean
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir
ninja

- name: Run test suite on SKX and SKL
run: |
sde -skx -- ./builddir/testexe
sde -skl -- ./builddir/testexe

SPR-gcc13-special-cases:

runs-on: intel-ubuntu-latest
Expand Down
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
test:
meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

test_openmp:
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

bench:
Expand Down
10 changes: 8 additions & 2 deletions lib/meson.build
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
libtargets = []

# Add compile flags for OpenMP if enabled
openmpflags = []
if get_option('use_openmp')
openmpflags = ['-DXSS_USE_OPENMP=true', '-fopenmp']
endif

if cpp.has_argument('-march=haswell')
libtargets += static_library('libavx',
files(
'x86simdsort-avx2.cpp',
),
include_directories : [src],
cpp_args : ['-march=haswell'],
cpp_args : ['-march=haswell', openmpflags],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -17,7 +23,7 @@ if cpp.has_argument('-march=skylake-avx512')
'x86simdsort-skx.cpp',
),
include_directories : [src],
cpp_args : ['-march=skylake-avx512'],
cpp_args : ['-march=skylake-avx512', openmpflags],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand Down
1 change: 1 addition & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ subdir('lib')
libsimdsort = shared_library('x86simdsortcpp',
'lib/x86simdsort.cpp',
include_directories : [src, utils, lib],
link_args : [openmpflags],
link_with : [libtargets],
gnu_symbol_visibility : 'inlineshidden',
install : true,
Expand Down
6 changes: 4 additions & 2 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ option('build_benchmarks', type : 'boolean', value : false,
description : 'Build benchmarking suite (default: "false").')
option('build_ippbench', type : 'boolean', value : false,
description : 'Add IPP sort to benchmarks (default: "false").')
option('build_vqsortbench', type : 'boolean', value : false,
description : 'Add google vqsort to benchmarks (default: "false").')
option('build_vqsortbench', type : 'boolean', value : true,
description : 'Add google vqsort to benchmarks (default: "true").')
option('use_openmp', type : 'boolean', value : false,
description : 'Use OpenMP to accelerate key-value sort (default: "false").')
2 changes: 1 addition & 1 deletion scripts/branch-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ build_branch() {
fi
fi
cd $dir_name
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir
cd builddir
ninja
cd ../../
Expand Down
87 changes: 83 additions & 4 deletions src/xss-common-keyvaluesort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
#include "xss-common-qsort.h"
#include "xss-network-keyvaluesort.hpp"

#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
#define XSS_COMPILE_OPENMP
#include <omp.h>
#endif

/*
* Partition one ZMM register based on the pivot and returns the index of the
* last element that is less than or equal to the pivot.
Expand Down Expand Up @@ -366,7 +371,8 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
type2_t *indexes,
arrsize_t left,
arrsize_t right,
int max_iters)
int max_iters,
arrsize_t task_threshold)
{
/*
* Resort to std::sort if quicksort isn't making any progress
Expand All @@ -391,14 +397,61 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
type1_t biggest = vtype1::type_min();
arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4>(
keys, indexes, left, right + 1, pivot, &smallest, &biggest);

#ifdef XSS_COMPILE_OPENMP
if (pivot != smallest) {
bool parallel_left = (pivot_index - left) > task_threshold;
if (parallel_left) {
#pragma omp task
kvsort_<vtype1, vtype2>(keys,
indexes,
left,
pivot_index - 1,
max_iters - 1,
task_threshold);
}
else {
kvsort_<vtype1, vtype2>(keys,
indexes,
left,
pivot_index - 1,
max_iters - 1,
task_threshold);
}
}
if (pivot != biggest) {
bool parallel_right = (right - pivot_index) > task_threshold;

if (parallel_right) {
#pragma omp task
kvsort_<vtype1, vtype2>(keys,
indexes,
pivot_index,
right,
max_iters - 1,
task_threshold);
}
else {
kvsort_<vtype1, vtype2>(keys,
indexes,
pivot_index,
right,
max_iters - 1,
task_threshold);
}
}
#else
UNUSED(task_threshold);

if (pivot != smallest) {
kvsort_<vtype1, vtype2>(
keys, indexes, left, pivot_index - 1, max_iters - 1);
keys, indexes, left, pivot_index - 1, max_iters - 1, 0);
}
if (pivot != biggest) {
kvsort_<vtype1, vtype2>(
keys, indexes, pivot_index, right, max_iters - 1);
keys, indexes, pivot_index, right, max_iters - 1, 0);
}
#endif
}

template <typename vtype1,
Expand Down Expand Up @@ -486,7 +539,33 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv(
UNUSED(hasnan);
}

kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters);
#ifdef XSS_COMPILE_OPENMP

bool use_parallel = arrsize > 10000;

if (use_parallel) {
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
constexpr int thread_limit = 8;
int thread_count = std::min(thread_limit, omp_get_max_threads());
arrsize_t task_threshold
= std::max((arrsize_t)10000, arrsize / 100);

// We use omp parallel and then omp single to setup the threads that will run the omp task calls in kvsort_
// The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
#pragma omp parallel num_threads(thread_count)
#pragma omp single
kvsort_<keytype, valtype>(
keys, indexes, 0, arrsize - 1, maxiters, task_threshold);
}
else {
kvsort_<keytype, valtype>(
keys, indexes, 0, arrsize - 1, maxiters, 0);
}
#else
kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters, 0);
#endif

replace_inf_with_nan(keys, arrsize, nan_count);

if (descending) {
Expand Down