From 71a9cfaa5bd3515cb80706923c61c18ba9c0d2f2 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Wed, 22 May 2024 11:01:12 -0700 Subject: [PATCH 1/3] Adds OpenMP supports for kv-sort --- Makefile | 6 +++- lib/meson.build | 10 +++++-- meson.build | 1 + meson_options.txt | 6 ++-- scripts/branch-compare.sh | 2 +- src/xss-common-keyvaluesort.hpp | 52 ++++++++++++++++++++++++++++++--- 6 files changed, 67 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 39010ea2..e00a57d7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,9 @@ test: - meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir + meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir + cd builddir && ninja + +test_openmp: + meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir cd builddir && ninja bench: diff --git a/lib/meson.build b/lib/meson.build index 7850b97d..5cbc105f 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -1,12 +1,18 @@ libtargets = [] +# Add compile flags for OpenMP if enabled +openmpflags = [] +if get_option('use_openmp') + openmpflags = ['-DXSS_USE_OPENMP=true', '-fopenmp'] +endif + if cpp.has_argument('-march=haswell') libtargets += static_library('libavx', files( 'x86simdsort-avx2.cpp', ), include_directories : [src], - cpp_args : ['-march=haswell'], + cpp_args : ['-march=haswell', openmpflags], gnu_symbol_visibility : 'inlineshidden', ) endif @@ -17,7 +23,7 @@ if cpp.has_argument('-march=skylake-avx512') 'x86simdsort-skx.cpp', ), include_directories : [src], - cpp_args : ['-march=skylake-avx512'], + cpp_args : ['-march=skylake-avx512', openmpflags], gnu_symbol_visibility : 'inlineshidden', ) endif diff --git a/meson.build b/meson.build index 235ad5fe..b954eabd 100644 --- a/meson.build +++ b/meson.build @@ -37,6 +37,7 @@ subdir('lib') libsimdsort = shared_library('x86simdsortcpp', 'lib/x86simdsort.cpp', include_directories : [src, utils, 
lib], + link_args : [openmpflags], link_with : [libtargets], gnu_symbol_visibility : 'inlineshidden', install : true, diff --git a/meson_options.txt b/meson_options.txt index 6bf19447..b8d4fbf2 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -4,5 +4,7 @@ option('build_benchmarks', type : 'boolean', value : false, description : 'Build benchmarking suite (default: "false").') option('build_ippbench', type : 'boolean', value : false, description : 'Add IPP sort to benchmarks (default: "false").') -option('build_vqsortbench', type : 'boolean', value : false, - description : 'Add google vqsort to benchmarks (default: "false").') +option('build_vqsortbench', type : 'boolean', value : true, + description : 'Add google vqsort to benchmarks (default: "true").') +option('use_openmp', type : 'boolean', value : false, + description : 'Use OpenMP to accelerate key-value sort (default: "false").') diff --git a/scripts/branch-compare.sh b/scripts/branch-compare.sh index 8b36d98c..0d5057f5 100755 --- a/scripts/branch-compare.sh +++ b/scripts/branch-compare.sh @@ -27,7 +27,7 @@ build_branch() { fi fi cd $dir_name - meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir + meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir cd builddir ninja cd ../../ diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp index 2615aad8..6595563a 100644 --- a/src/xss-common-keyvaluesort.hpp +++ b/src/xss-common-keyvaluesort.hpp @@ -366,7 +366,8 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys, type2_t *indexes, arrsize_t left, arrsize_t right, - int max_iters) + int max_iters, + arrsize_t task_threshold) { /* * Resort to std::sort if quicksort isnt making any progress @@ -391,14 +392,44 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys, type1_t biggest = vtype1::type_min(); arrsize_t pivot_index = kvpartition_unrolled( keys, indexes, left, right + 1, pivot, &smallest, &biggest); + + +#if 
defined(XSS_USE_OPENMP) && defined(_OPENMP) + if (pivot != smallest) { + bool parallelLeft = (pivot_index - left) > task_threshold; + if (parallelLeft){ + #pragma omp task if(parallelLeft) + kvsort_( + keys, indexes, left, pivot_index - 1, max_iters - 1, task_threshold); + }else{ + kvsort_( + keys, indexes, left, pivot_index - 1, max_iters - 1, task_threshold); + } + } + if (pivot != biggest) { + bool parallelRight = (right - pivot_index) > task_threshold; + + if (parallelRight){ + #pragma omp task if(parallelRight) + kvsort_( + keys, indexes, pivot_index, right, max_iters - 1, task_threshold); + }else{ + kvsort_( + keys, indexes, pivot_index, right, max_iters - 1, task_threshold); + } + } +#else + UNUSED(task_threshold); + if (pivot != smallest) { kvsort_( - keys, indexes, left, pivot_index - 1, max_iters - 1); + keys, indexes, left, pivot_index - 1, max_iters - 1, 0); } if (pivot != biggest) { kvsort_( - keys, indexes, pivot_index, right, max_iters - 1); + keys, indexes, pivot_index, right, max_iters - 1, 0); } +#endif } template (keys, indexes, 0, arrsize - 1, maxiters); +#if defined(XSS_USE_OPENMP) && defined(_OPENMP) + bool useParallel = arrsize > 10000; + arrsize_t taskThreshold = std::max((arrsize_t) 10000, arrsize / 100); + if (useParallel){ + #pragma omp parallel + #pragma omp single + kvsort_(keys, indexes, 0, arrsize - 1, maxiters, taskThreshold); + }else{ + kvsort_(keys, indexes, 0, arrsize - 1, maxiters, taskThreshold); + } +#else + kvsort_(keys, indexes, 0, arrsize - 1, maxiters, 0); +#endif + replace_inf_with_nan(keys, arrsize, nan_count); if (descending) { From 5c81fb17e1c04b7f693549acb738811bb2735001 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Wed, 22 May 2024 11:48:21 -0700 Subject: [PATCH 2/3] Adds a GitHub CI step for OpenMP and some formatting changes --- .github/workflows/c-cpp.yml | 32 ++++++++++++++ src/xss-common-keyvaluesort.hpp | 76 +++++++++++++++++++++------------ 2 files changed, 80 insertions(+), 28 deletions(-) diff --git 
a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index b4854e98..85b73382 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -135,6 +135,38 @@ jobs: - name: Run test suite on SPR run: sde -spr -- ./builddir/testexe + SKX-SKL-openmp: + + runs-on: intel-ubuntu-latest + + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Install dependencies + run: | + sudo apt update + sudo apt -y install g++-10 libgtest-dev meson curl git + + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + + - name: Build + env: + CXX: g++-10 + run: | + make clean + meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir + cd builddir + ninja + + - name: Run test suite on SKX and SKL + run: | + sde -skx -- ./builddir/testexe + sde -skl -- ./builddir/testexe + SPR-gcc13-special-cases: runs-on: intel-ubuntu-latest diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp index 6595563a..96741822 100644 --- a/src/xss-common-keyvaluesort.hpp +++ b/src/xss-common-keyvaluesort.hpp @@ -393,29 +393,46 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys, arrsize_t pivot_index = kvpartition_unrolled( keys, indexes, left, right + 1, pivot, &smallest, &biggest); - #if defined(XSS_USE_OPENMP) && defined(_OPENMP) if (pivot != smallest) { - bool parallelLeft = (pivot_index - left) > task_threshold; - if (parallelLeft){ - #pragma omp task if(parallelLeft) - kvsort_( - keys, indexes, left, pivot_index - 1, max_iters - 1, task_threshold); - }else{ - kvsort_( - keys, indexes, left, pivot_index - 1, max_iters - 1, task_threshold); + bool parallel_left = (pivot_index - left) > task_threshold; + if (parallel_left) { +#pragma omp task + kvsort_(keys, + 
indexes, + left, + pivot_index - 1, + max_iters - 1, + task_threshold); + } + else { + kvsort_(keys, + indexes, + left, + pivot_index - 1, + max_iters - 1, + task_threshold); } } if (pivot != biggest) { - bool parallelRight = (right - pivot_index) > task_threshold; - - if (parallelRight){ - #pragma omp task if(parallelRight) - kvsort_( - keys, indexes, pivot_index, right, max_iters - 1, task_threshold); - }else{ - kvsort_( - keys, indexes, pivot_index, right, max_iters - 1, task_threshold); + bool parallel_right = (right - pivot_index) > task_threshold; + + if (parallel_right) { +#pragma omp task + kvsort_(keys, + indexes, + pivot_index, + right, + max_iters - 1, + task_threshold); + } + else { + kvsort_(keys, + indexes, + pivot_index, + right, + max_iters - 1, + task_threshold); } } #else @@ -518,19 +535,22 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv( } #if defined(XSS_USE_OPENMP) && defined(_OPENMP) - bool useParallel = arrsize > 10000; - arrsize_t taskThreshold = std::max((arrsize_t) 10000, arrsize / 100); - if (useParallel){ - #pragma omp parallel - #pragma omp single - kvsort_(keys, indexes, 0, arrsize - 1, maxiters, taskThreshold); - }else{ - kvsort_(keys, indexes, 0, arrsize - 1, maxiters, taskThreshold); + bool use_parallel = arrsize > 10000; + arrsize_t task_threshold = std::max((arrsize_t)10000, arrsize / 100); + if (use_parallel) { +#pragma omp parallel +#pragma omp single + kvsort_( + keys, indexes, 0, arrsize - 1, maxiters, task_threshold); + } + else { + kvsort_( + keys, indexes, 0, arrsize - 1, maxiters, task_threshold); } #else kvsort_(keys, indexes, 0, arrsize - 1, maxiters, 0); -#endif - +#endif + replace_inf_with_nan(keys, arrsize, nan_count); if (descending) { From 5224f603cb9c6fb8c83df24e008dbf9979ace619 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Tue, 28 May 2024 10:03:14 -0700 Subject: [PATCH 3/3] Some style changes and commenting, plus limits thread count to 8 by default --- src/xss-common-keyvaluesort.hpp | 25 
++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp index 96741822..e6fecc8e 100644 --- a/src/xss-common-keyvaluesort.hpp +++ b/src/xss-common-keyvaluesort.hpp @@ -11,6 +11,11 @@ #include "xss-common-qsort.h" #include "xss-network-keyvaluesort.hpp" +#if defined(XSS_USE_OPENMP) && defined(_OPENMP) +#define XSS_COMPILE_OPENMP +#include <omp.h> +#endif + /* * Parition one ZMM register based on the pivot and returns the index of the * last element that is less than equal to the pivot. @@ -393,7 +398,7 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys, arrsize_t pivot_index = kvpartition_unrolled( keys, indexes, left, right + 1, pivot, &smallest, &biggest); -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) +#ifdef XSS_COMPILE_OPENMP if (pivot != smallest) { bool parallel_left = (pivot_index - left) > task_threshold; if (parallel_left) { @@ -534,18 +539,28 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv( UNUSED(hasnan); } -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) +#ifdef XSS_COMPILE_OPENMP + bool use_parallel = arrsize > 10000; - arrsize_t task_threshold = std::max((arrsize_t)10000, arrsize / 100); + if (use_parallel) { -#pragma omp parallel + // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system + constexpr int thread_limit = 8; + int thread_count = std::min(thread_limit, omp_get_max_threads()); + arrsize_t task_threshold + = std::max((arrsize_t)10000, arrsize / 100); + + // We use omp parallel and then omp single to setup the threads that will run the omp task calls in kvsort_ + // The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems + // Note that we do not use the if(...) 
clause built into OpenMP, because it causes a performance regression for small arrays +#pragma omp parallel num_threads(thread_count) #pragma omp single kvsort_( keys, indexes, 0, arrsize - 1, maxiters, task_threshold); } else { kvsort_( - keys, indexes, 0, arrsize - 1, maxiters, task_threshold); + keys, indexes, 0, arrsize - 1, maxiters, 0); } #else kvsort_(keys, indexes, 0, arrsize - 1, maxiters, 0);