diff --git a/Makefile b/Makefile
index df55afbf..6169ec1b 100644
--- a/Makefile
+++ b/Makefile
@@ -38,4 +38,4 @@ meson:
 	cd builddir && ninja
 
 clean:
-	$(RM) -rf $(TESTDIR)/*.o $(UTILS)/*.o testexe benchexe builddir
+	$(RM) -rf $(TESTDIR)/*.o $(BENCHDIR)/*.o $(UTILS)/*.o testexe benchexe builddir
diff --git a/benchmarks/bench-qsort-common.h b/benchmarks/bench-qsort-common.h
new file mode 100644
index 00000000..fe0decf7
--- /dev/null
+++ b/benchmarks/bench-qsort-common.h
@@ -0,0 +1,11 @@
+#ifndef AVX512_BENCH_COMMON
+#define AVX512_BENCH_COMMON
+
+#include <benchmark/benchmark.h>
+#include "rand_array.h"
+#include "cpuinfo.h"
+#include "avx512-16bit-qsort.hpp"
+#include "avx512-32bit-qsort.hpp"
+#include "avx512-64bit-qsort.hpp"
+
+#endif
diff --git a/benchmarks/bench_partial_qsort.hpp b/benchmarks/bench_partial_qsort.hpp
new file mode 100644
index 00000000..d54ceb31
--- /dev/null
+++ b/benchmarks/bench_partial_qsort.hpp
@@ -0,0 +1,72 @@
+#include "bench-qsort-common.h"
+
+template <typename T>
+static void avx512_partial_qsort(benchmark::State& state) {
+    if (!cpu_has_avx512bw()) {
+        state.SkipWithMessage("Requires AVX512 BW ISA");
+    }
+    if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) {
+        state.SkipWithMessage("Requires AVX512 VBMI2 ISA");
+    }
+    // Perform setup here
+    int64_t K = state.range(0);
+    size_t ARRSIZE = 10000;
+    std::vector<T> arr;
+    std::vector<T> arr_bkp;
+
+    /* Initialize elements */
+    arr = get_uniform_rand_array<T>(ARRSIZE);
+    arr_bkp = arr;
+
+    /* call avx512_partial_qsort */
+    for (auto _ : state) {
+        avx512_partial_qsort<T>(arr.data(), K, ARRSIZE);
+
+        state.PauseTiming();
+        arr = arr_bkp;
+        state.ResumeTiming();
+    }
+}
+
+template <typename T>
+static void stdpartialsort(benchmark::State& state) {
+    // Perform setup here
+    int64_t K = state.range(0);
+    size_t ARRSIZE = 10000;
+    std::vector<T> arr;
+    std::vector<T> arr_bkp;
+
+    /* Initialize elements */
+    arr = get_uniform_rand_array<T>(ARRSIZE);
+    arr_bkp = arr;
+
+    /* call std::partial_sort */
+    for (auto _ : state) {
+        std::partial_sort(arr.begin(), arr.begin() + K, arr.end());
+
+        state.PauseTiming();
+        arr = arr_bkp;
+        state.ResumeTiming();
+    }
+}
+
+// Register the function as a benchmark
+BENCHMARK(avx512_partial_qsort<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_partial_qsort<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_partial_qsort<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+
+BENCHMARK(avx512_partial_qsort<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_partial_qsort<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_partial_qsort<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+
+//BENCHMARK(avx512_partial_qsort<float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_partial_qsort<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_partial_qsort<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
diff --git a/benchmarks/bench_qselect.hpp b/benchmarks/bench_qselect.hpp
new file mode 100644
index 00000000..fea5bea4
--- /dev/null
+++ b/benchmarks/bench_qselect.hpp
@@ -0,0 +1,72 @@
+#include "bench-qsort-common.h"
+
+template <typename T>
+static void avx512_qselect(benchmark::State& state) {
+    if (!cpu_has_avx512bw()) {
+        state.SkipWithMessage("Requires AVX512 BW ISA");
+    }
+    if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) {
+        state.SkipWithMessage("Requires AVX512 VBMI2 ISA");
+    }
+    // Perform setup here
+    int64_t K = state.range(0);
+    size_t ARRSIZE = 10000;
+    std::vector<T> arr;
+    std::vector<T> arr_bkp;
+
+    /* Initialize elements */
+    arr = get_uniform_rand_array<T>(ARRSIZE);
+    arr_bkp = arr;
+
+    /* call avx512 quickselect */
+    for (auto _ : state) {
+        avx512_qselect<T>(arr.data(), K, ARRSIZE);
+
+        state.PauseTiming();
+        arr = arr_bkp;
+        state.ResumeTiming();
+    }
+}
+
+template <typename T>
+static void stdnthelement(benchmark::State& state) {
+    // Perform setup here
+    int64_t K = state.range(0);
+    size_t ARRSIZE = 10000;
+    std::vector<T> arr;
+    std::vector<T> arr_bkp;
+
+    /* Initialize elements */
+    arr = get_uniform_rand_array<T>(ARRSIZE);
+    arr_bkp = arr;
+
+    /* call std::nth_element */
+    for (auto _ : state) {
+        std::nth_element(arr.begin(), arr.begin() + K, arr.end());
+
+        state.PauseTiming();
+        arr = arr_bkp;
+        state.ResumeTiming();
+    }
+}
+
+// Register the function as a benchmark
+BENCHMARK(avx512_qselect<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_qselect<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_qselect<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+
+BENCHMARK(avx512_qselect<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_qselect<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_qselect<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+
+//BENCHMARK(avx512_qselect<float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_qselect<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(avx512_qselect<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
diff --git a/benchmarks/bench_qsort.cpp b/benchmarks/bench_qsort.cpp
index 3f622fc3..d14bf218 100644
--- a/benchmarks/bench_qsort.cpp
+++ b/benchmarks/bench_qsort.cpp
@@ -1,73 +1,3 @@
-#include <benchmark/benchmark.h>
-#include "rand_array.h"
-#include "cpuinfo.h"
-#include "avx512-16bit-qsort.hpp"
-#include "avx512-32bit-qsort.hpp"
-#include "avx512-64bit-qsort.hpp"
-
-template <typename T>
-static void avx512_qsort(benchmark::State& state) {
-    if (!cpu_has_avx512bw()) {
-        state.SkipWithMessage("Requires AVX512 BW ISA");
-    }
-    if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) {
-        state.SkipWithMessage("Requires AVX512 VBMI2 ISA");
-    }
-    // Perform setup here
-    size_t ARRSIZE = state.range(0);
-    std::vector<T> arr;
-    std::vector<T> arr_bkp;
-
-    /* Initialize elements is reverse order */
-    arr = get_uniform_rand_array<T>(ARRSIZE);
-    arr_bkp = arr;
-
-    /* call avx512 quicksort */
-    for (auto _ : state) {
-        avx512_qsort<T>(arr.data(), ARRSIZE);
-        state.PauseTiming();
-        arr = arr_bkp;
-        state.ResumeTiming();
-    }
-}
-
-template <typename T>
-static void stdsort(benchmark::State& state) {
-    // Perform setup here
-    size_t ARRSIZE = state.range(0);
-    std::vector<T> arr;
-    std::vector<T> arr_bkp;
-
-    /* Initialize elements is reverse order */
-    arr = get_uniform_rand_array<T>(ARRSIZE);
-    arr_bkp = arr;
-
-    /* call avx512 quicksort */
-    for (auto _ : state) {
-        std::sort(arr.begin(), arr.end());
-        state.PauseTiming();
-        arr = arr_bkp;
-        state.ResumeTiming();
-    }
-}
-
-// Register the function as a benchmark
-BENCHMARK(avx512_qsort<float>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<float>)->Arg(10000)->Arg(1000000);
-BENCHMARK(avx512_qsort<uint32_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<uint32_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(avx512_qsort<int32_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<int32_t>)->Arg(10000)->Arg(1000000);
-
-BENCHMARK(avx512_qsort<double>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<double>)->Arg(10000)->Arg(1000000);
-BENCHMARK(avx512_qsort<uint64_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<uint64_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(avx512_qsort<int64_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<int64_t>)->Arg(10000)->Arg(10000000);
-
-//BENCHMARK(avx512_qsort<float16>)->Arg(10000)->Arg(1000000);
-BENCHMARK(avx512_qsort<uint16_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<uint16_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(avx512_qsort<int16_t>)->Arg(10000)->Arg(1000000);
-BENCHMARK(stdsort<int16_t>)->Arg(10000)->Arg(10000000);
+#include "bench_qsort.hpp"
+#include "bench_qselect.hpp"
+#include "bench_partial_qsort.hpp"
diff --git a/benchmarks/bench_qsort.hpp b/benchmarks/bench_qsort.hpp
new file mode 100644
index 00000000..6659fdae
--- /dev/null
+++ b/benchmarks/bench_qsort.hpp
@@ -0,0 +1,68 @@
+#include "bench-qsort-common.h"
+
+template <typename T>
+static void avx512_qsort(benchmark::State& state) {
+    if (!cpu_has_avx512bw()) {
+        state.SkipWithMessage("Requires AVX512 BW ISA");
+    }
+    if ((sizeof(T) == 2) && (!cpu_has_avx512_vbmi2())) {
+        state.SkipWithMessage("Requires AVX512 VBMI2 ISA");
+    }
+    // Perform setup here
+    size_t ARRSIZE = state.range(0);
+    std::vector<T> arr;
+    std::vector<T> arr_bkp;
+
+    /* Initialize elements */
+    arr = get_uniform_rand_array<T>(ARRSIZE);
+    arr_bkp = arr;
+
+    /* call avx512 quicksort */
+    for (auto _ : state) {
+        avx512_qsort<T>(arr.data(), ARRSIZE);
+        state.PauseTiming();
+        arr = arr_bkp;
+        state.ResumeTiming();
+    }
+}
+
+template <typename T>
+static void stdsort(benchmark::State& state) {
+    // Perform setup here
+    size_t ARRSIZE = state.range(0);
+    std::vector<T> arr;
+    std::vector<T> arr_bkp;
+
+    /* Initialize elements */
+    arr = get_uniform_rand_array<T>(ARRSIZE);
+    arr_bkp = arr;
+
+    /* call std::sort */
+    for (auto _ : state) {
+        std::sort(arr.begin(), arr.end());
+        state.PauseTiming();
+        arr = arr_bkp;
+        state.ResumeTiming();
+    }
+}
+
+// Register the function as a benchmark
+BENCHMARK(avx512_qsort<float>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<float>)->Arg(10000)->Arg(1000000);
+BENCHMARK(avx512_qsort<uint32_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<uint32_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(avx512_qsort<int32_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<int32_t>)->Arg(10000)->Arg(1000000);
+
+BENCHMARK(avx512_qsort<double>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<double>)->Arg(10000)->Arg(1000000);
+BENCHMARK(avx512_qsort<uint64_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<uint64_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(avx512_qsort<int64_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<int64_t>)->Arg(10000)->Arg(1000000);
+
+//BENCHMARK(avx512_qsort<float16>)->Arg(10000)->Arg(1000000);
+BENCHMARK(avx512_qsort<uint16_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<uint16_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(avx512_qsort<int16_t>)->Arg(10000)->Arg(1000000);
+BENCHMARK(stdsort<int16_t>)->Arg(10000)->Arg(1000000);
diff --git a/benchmarks/bench_qsortfp16.cpp b/benchmarks/bench_qsortfp16.cpp
index 0b454f7e..eddd876d 100644
--- a/benchmarks/bench_qsortfp16.cpp
+++ b/benchmarks/bench_qsortfp16.cpp
@@ -45,7 +45,7 @@ static void stdsort(benchmark::State& state) {
         }
         arr_bkp = arr;
 
-        /* call avx512 quicksort */
+        /* call std::sort */
         for (auto _ : state) {
             std::sort(arr.begin(), arr.end());
             state.PauseTiming();
@@ -61,3 +61,131 @@ static void stdsort(benchmark::State& state) {
 // Register the function as a benchmark
 BENCHMARK(avx512_qsort<_Float16>)->Arg(10000)->Arg(1000000);
 BENCHMARK(stdsort<_Float16>)->Arg(10000)->Arg(1000000);
+
+template <typename T>
+static void avx512_qselect(benchmark::State& state) {
+    if (cpu_has_avx512fp16()) {
+        // Perform setup here
+        int64_t K = state.range(0);
+        size_t ARRSIZE = 10000;
+        std::vector<T> arr;
+        std::vector<T> arr_bkp;
+
+        /* Initialize elements */
+        for (size_t jj = 0; jj < ARRSIZE; ++jj) {
+            _Float16 temp = (float) rand() / (float)(RAND_MAX);
+            arr.push_back(temp);
+        }
+        arr_bkp = arr;
+
+        /* call avx512 quickselect */
+        for (auto _ : state) {
+            avx512_qselect<T>(arr.data(), K, ARRSIZE);
+
+            state.PauseTiming();
+            arr = arr_bkp;
+            state.ResumeTiming();
+        }
+    }
+    else {
+        state.SkipWithMessage("Requires AVX512-FP16 ISA");
+    }
+}
+
+template <typename T>
+static void stdnthelement(benchmark::State& state) {
+    if (cpu_has_avx512fp16()) {
+        // Perform setup here
+        int64_t K = state.range(0);
+        size_t ARRSIZE = 10000;
+        std::vector<T> arr;
+        std::vector<T> arr_bkp;
+
+        /* Initialize elements */
+        for (size_t jj = 0; jj < ARRSIZE; ++jj) {
+            _Float16 temp = (float) rand() / (float)(RAND_MAX);
+            arr.push_back(temp);
+        }
+        arr_bkp = arr;
+
+        /* call std::nth_element */
+        for (auto _ : state) {
+            std::nth_element(arr.begin(), arr.begin() + K, arr.end());
+
+            state.PauseTiming();
+            arr = arr_bkp;
+            state.ResumeTiming();
+        }
+    }
+    else {
+        state.SkipWithMessage("Requires AVX512-FP16 ISA");
+    }
+}
+
+// Register the function as a benchmark
+BENCHMARK(avx512_qselect<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdnthelement<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+
+template <typename T>
+static void avx512_partial_qsort(benchmark::State& state) {
+    if (cpu_has_avx512fp16()) {
+        // Perform setup here
+        int64_t K = state.range(0);
+        size_t ARRSIZE = 10000;
+        std::vector<T> arr;
+        std::vector<T> arr_bkp;
+
+        /* Initialize elements */
+        for (size_t jj = 0; jj < ARRSIZE; ++jj) {
+            _Float16 temp = (float) rand() / (float)(RAND_MAX);
+            arr.push_back(temp);
+        }
+        arr_bkp = arr;
+
+        /* call avx512_partial_qsort */
+        for (auto _ : state) {
+            avx512_partial_qsort<T>(arr.data(), K, ARRSIZE);
+
+            state.PauseTiming();
+            arr = arr_bkp;
+            state.ResumeTiming();
+        }
+    }
+    else {
+        state.SkipWithMessage("Requires AVX512-FP16 ISA");
+    }
+}
+
+template <typename T>
+static void stdpartialsort(benchmark::State& state) {
+    if (cpu_has_avx512fp16()) {
+        // Perform setup here
+        int64_t K = state.range(0);
+        size_t ARRSIZE = 10000;
+        std::vector<T> arr;
+        std::vector<T> arr_bkp;
+
+        /* Initialize elements */
+        for (size_t jj = 0; jj < ARRSIZE; ++jj) {
+            _Float16 temp = (float) rand() / (float)(RAND_MAX);
+            arr.push_back(temp);
+        }
+        arr_bkp = arr;
+
+        /* call std::partial_sort */
+        for (auto _ : state) {
+            std::partial_sort(arr.begin(), arr.begin() + K, arr.end());
+
+            state.PauseTiming();
+            arr = arr_bkp;
+            state.ResumeTiming();
+        }
+    }
+    else {
+        state.SkipWithMessage("Requires AVX512-FP16 ISA");
+    }
+}
+
+// Register the function as a benchmark
+BENCHMARK(avx512_partial_qsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
+BENCHMARK(stdpartialsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
diff --git a/meson.build b/meson.build
index 79b23927..9193776d 100644
--- a/meson.build
+++ b/meson.build
@@ -11,8 +11,8 @@ gbench_dep = dependency('benchmark', required : true)
 
 fp16code = '''#include<immintrin.h>
 int main() { 
-  __mm512h temp = _mm512_set1_ph(1.0f);
-  __mm512h var2 = _mm512_min_ph(temp, temp);
+  __m512h temp = _mm512_set1_ph(1.0f);
+  __m512h var2 = _mm512_min_ph(temp, temp);
   return 0;
 }
 '''
diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h
index 7ab22123..6e0743d6 100644
--- a/src/avx512-16bit-common.h
+++ b/src/avx512-16bit-common.h
@@ -289,4 +289,36 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_16bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
+template <typename vtype, typename type_t>
+static void
+qselect_16bit_(type_t *arr, int64_t pos,
+               int64_t left, int64_t right,
+               int64_t max_iters)
+{
+    /*
+     * Resort to std::sort if quicksort isnt making any progress
+     */
+    if (max_iters <= 0) {
+        std::sort(arr + left, arr + right + 1, comparison_func<vtype>);
+        return;
+    }
+    /*
+     * Base case: use bitonic networks to sort arrays <= 128
+     */
+    if (right + 1 - left <= 128) {
+        sort_128_16bit<vtype>(arr + left, (int32_t)(right + 1 - left));
+        return;
+    }
+
+    type_t pivot = get_pivot_16bit<vtype>(arr, left, right);
+    type_t smallest = vtype::type_max();
+    type_t biggest = vtype::type_min();
+    int64_t pivot_index = partition_avx512<vtype>(
+            arr, left, right + 1, pivot, &smallest, &biggest);
+    if ((pivot != smallest) && (pos < pivot_index))
+        qselect_16bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);
+    else if ((pivot != biggest) && (pos >= pivot_index))
+        qselect_16bit_<vtype>(arr, pos, pivot_index, right, max_iters - 1);
+}
+
 #endif // AVX512_16BIT_COMMON
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
index fcbaf879..606f8706 100644
--- a/src/avx512-16bit-qsort.hpp
+++ b/src/avx512-16bit-qsort.hpp
@@ -405,6 +405,34 @@ replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count)
     }
 }
 
+template <>
+void avx512_qselect(int16_t *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        qselect_16bit_<zmm_vector<int16_t>, int16_t>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+    }
+}
+
+template <>
+void avx512_qselect(uint16_t *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        qselect_16bit_<zmm_vector<uint16_t>, uint16_t>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+    }
+}
+
+void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
+        qselect_16bit_<zmm_vector<float16>, uint16_t>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+        replace_inf_with_nan(arr, arrsize, nan_count);
+    }
+}
+
 template <>
 void avx512_qsort(int16_t *arr, int64_t arrsize)
 {
@@ -432,4 +460,5 @@ void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize)
         replace_inf_with_nan(arr, arrsize, nan_count);
     }
 }
+
 #endif // AVX512_QSORT_16BIT
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
index 1cbba00b..e9e97aa1 100644
--- a/src/avx512-32bit-qsort.hpp
+++ b/src/avx512-32bit-qsort.hpp
@@ -656,6 +656,38 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
+template <typename vtype, typename type_t>
+static void
+qselect_32bit_(type_t *arr, int64_t pos,
+               int64_t left, int64_t right,
+               int64_t max_iters)
+{
+    /*
+     * Resort to std::sort if quicksort isnt making any progress
+     */
+    if (max_iters <= 0) {
+        std::sort(arr + left, arr + right + 1);
+        return;
+    }
+    /*
+     * Base case: use bitonic networks to sort arrays <= 128
+     */
+    if (right + 1 - left <= 128) {
+        sort_128_32bit<vtype>(arr + left, (int32_t)(right + 1 - left));
+        return;
+    }
+
+    type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
+    type_t smallest = vtype::type_max();
+    type_t biggest = vtype::type_min();
+    int64_t pivot_index = partition_avx512<vtype>(
+            arr, left, right + 1, pivot, &smallest, &biggest);
+    if ((pivot != smallest) && (pos < pivot_index))
+        qselect_32bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);
+    else if ((pivot != biggest) && (pos >= pivot_index))
+        qselect_32bit_<vtype>(arr, pos, pivot_index, right, max_iters - 1);
+}
+
 X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
@@ -681,6 +713,35 @@ replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count)
     }
 }
 
+template <>
+void avx512_qselect<int32_t>(int32_t *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        qselect_32bit_<zmm_vector<int32_t>, int32_t>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+    }
+}
+
+template <>
+void avx512_qselect<uint32_t>(uint32_t *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        qselect_32bit_<zmm_vector<uint32_t>, uint32_t>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+    }
+}
+
+template <>
+void avx512_qselect<float>(float *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
+        qselect_32bit_<zmm_vector<float>, float>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+        replace_inf_with_nan(arr, arrsize, nan_count);
+    }
+}
+
 template <>
 void avx512_qsort<int32_t>(int32_t *arr, int64_t arrsize)
 {
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
index 32a4731e..7fc8acf3 100644
--- a/src/avx512-64bit-common.h
+++ b/src/avx512-64bit-common.h
@@ -4,10 +4,16 @@
  * Authors: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
  * ****************************************************************/
 
-#ifndef AVX512_64BIT_COMMOM
-#define AVX512_64BIT_COMMOM
+#ifndef AVX512_64BIT_COMMON
+#define AVX512_64BIT_COMMON
 #include "avx512-common-qsort.h"
 
+/*
+ * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic
+ * sorting network (see
+ * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
+ */
+// ZMM                  7, 6, 5, 4, 3, 2, 1, 0
 #define NETWORK_64BIT_1 4, 5, 6, 7, 0, 1, 2, 3
 #define NETWORK_64BIT_2 0, 1, 2, 3, 4, 5, 6, 7
 #define NETWORK_64BIT_3 5, 4, 7, 6, 1, 0, 3, 2
diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp
index 62000549..dfb5376f 100644
--- a/src/avx512-64bit-qsort.hpp
+++ b/src/avx512-64bit-qsort.hpp
@@ -9,13 +9,6 @@
 
 #include "avx512-64bit-common.h"
 
-/*
- * Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic
- * sorting network (see
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
- */
-// ZMM                  7, 6, 5, 4, 3, 2, 1, 0
-
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
 X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
@@ -408,6 +401,67 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_64bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
+template <typename vtype, typename type_t>
+static void
+qselect_64bit_(type_t *arr, int64_t pos,
+               int64_t left, int64_t right,
+               int64_t max_iters)
+{
+    /*
+     * Resort to std::sort if quicksort isnt making any progress
+     */
+    if (max_iters <= 0) {
+        std::sort(arr + left, arr + right + 1);
+        return;
+    }
+    /*
+     * Base case: use bitonic networks to sort arrays <= 128
+     */
+    if (right + 1 - left <= 128) {
+        sort_128_64bit<vtype>(arr + left, (int32_t)(right + 1 - left));
+        return;
+    }
+
+    type_t pivot = get_pivot_64bit<vtype>(arr, left, right);
+    type_t smallest = vtype::type_max();
+    type_t biggest = vtype::type_min();
+    int64_t pivot_index = partition_avx512<vtype>(
+            arr, left, right + 1, pivot, &smallest, &biggest);
+    if ((pivot != smallest) && (pos < pivot_index))
+        qselect_64bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);
+    else if ((pivot != biggest) && (pos >= pivot_index))
+        qselect_64bit_<vtype>(arr, pos, pivot_index, right, max_iters - 1);
+}
+
+template <>
+void avx512_qselect<int64_t>(int64_t *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        qselect_64bit_<zmm_vector<int64_t>, int64_t>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+    }
+}
+
+template <>
+void avx512_qselect<uint64_t>(uint64_t *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        qselect_64bit_<zmm_vector<uint64_t>, uint64_t>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+    }
+}
+
+template <>
+void avx512_qselect<double>(double *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
+        qselect_64bit_<zmm_vector<double>, double>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+        replace_inf_with_nan(arr, arrsize, nan_count);
+    }
+}
+
 template <>
 void avx512_qsort<int64_t>(int64_t *arr, int64_t arrsize)
 {
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
index a80f2721..5b6591f0 100644
--- a/src/avx512-common-qsort.h
+++ b/src/avx512-common-qsort.h
@@ -88,9 +88,23 @@ struct zmm_vector;
 
 template <typename T>
 void avx512_qsort(T *arr, int64_t arrsize);
-
 void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize);
 
+template <typename T>
+void avx512_qselect(T *arr, int64_t k, int64_t arrsize);
+void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize);
+
+template <typename T>
+inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) {
+    avx512_qselect<T>(arr, k - 1, arrsize);
+    avx512_qsort<T>(arr, k - 1);
+}
+inline void avx512_partial_qsort_fp16(uint16_t *arr, int64_t k, int64_t arrsize)
+{
+    avx512_qselect_fp16(arr, k - 1, arrsize);
+    avx512_qsort_fp16(arr, k - 1);
+}
+
 template <typename vtype, typename T = typename vtype::type_t>
 bool comparison_func(const T &a, const T &b)
 {
@@ -117,8 +131,8 @@ static inline zmm_t cmp_merge(zmm_t in1, zmm_t in2, opmask_t mask)
     return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max
 }
 /*
- * Parition one ZMM register based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
+ * Parition one ZMM register based on the pivot and returns the
+ * number of elements that are greater than or equal to the pivot.
  */
 template <typename vtype, typename type_t, typename zmm_t>
 static inline int32_t partition_vec(type_t *arr,
@@ -129,20 +143,20 @@ static inline int32_t partition_vec(type_t *arr,
                                     zmm_t *smallest_vec,
                                     zmm_t *biggest_vec)
 {
-    /* which elements are larger than the pivot */
-    typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec);
-    int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask);
+    /* which elements are larger than or equal to the pivot */
+    typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
+    int32_t amount_ge_pivot = _mm_popcnt_u32((int32_t)ge_mask);
     vtype::mask_compressstoreu(
-            arr + left, vtype::knot_opmask(gt_mask), curr_vec);
+            arr + left, vtype::knot_opmask(ge_mask), curr_vec);
     vtype::mask_compressstoreu(
-            arr + right - amount_gt_pivot, gt_mask, curr_vec);
+            arr + right - amount_ge_pivot, ge_mask, curr_vec);
     *smallest_vec = vtype::min(curr_vec, *smallest_vec);
     *biggest_vec = vtype::max(curr_vec, *biggest_vec);
-    return amount_gt_pivot;
+    return amount_ge_pivot;
 }
 /*
  * Parition an array based on the pivot and returns the index of the
- * last element that is less than equal to the pivot.
+ * first element that is greater than or equal to the pivot.
  */
 template <typename vtype, typename type_t>
 static inline int64_t partition_avx512(type_t *arr,
@@ -174,7 +188,7 @@ static inline int64_t partition_avx512(type_t *arr,
 
     if (right - left == vtype::numlanes) {
         zmm_t vec = vtype::loadu(arr + left);
-        int32_t amount_gt_pivot = partition_vec<vtype>(arr,
+        int32_t amount_ge_pivot = partition_vec<vtype>(arr,
                                                        left,
                                                        left + vtype::numlanes,
                                                        vec,
@@ -183,7 +197,7 @@ static inline int64_t partition_avx512(type_t *arr,
                                                        &max_vec);
         *smallest = vtype::reducemin(min_vec);
         *biggest = vtype::reducemax(max_vec);
-        return left + (vtype::numlanes - amount_gt_pivot);
+        return left + (vtype::numlanes - amount_ge_pivot);
     }
 
     // first and last vtype::numlanes values are partitioned at the end
@@ -211,7 +225,7 @@ static inline int64_t partition_avx512(type_t *arr,
             left += vtype::numlanes;
         }
         // partition the current vector and save it on both sides of the array
-        int32_t amount_gt_pivot
+        int32_t amount_ge_pivot
                 = partition_vec<vtype>(arr,
                                        l_store,
                                        r_store + vtype::numlanes,
@@ -220,27 +234,27 @@ static inline int64_t partition_avx512(type_t *arr,
                                        &min_vec,
                                        &max_vec);
         ;
-        r_store -= amount_gt_pivot;
-        l_store += (vtype::numlanes - amount_gt_pivot);
+        r_store -= amount_ge_pivot;
+        l_store += (vtype::numlanes - amount_ge_pivot);
     }
 
     /* partition and save vec_left and vec_right */
-    int32_t amount_gt_pivot = partition_vec<vtype>(arr,
+    int32_t amount_ge_pivot = partition_vec<vtype>(arr,
                                                    l_store,
                                                    r_store + vtype::numlanes,
                                                    vec_left,
                                                    pivot_vec,
                                                    &min_vec,
                                                    &max_vec);
-    l_store += (vtype::numlanes - amount_gt_pivot);
-    amount_gt_pivot = partition_vec<vtype>(arr,
+    l_store += (vtype::numlanes - amount_ge_pivot);
+    amount_ge_pivot = partition_vec<vtype>(arr,
                                            l_store,
                                            l_store + vtype::numlanes,
                                            vec_right,
                                            pivot_vec,
                                            &min_vec,
                                            &max_vec);
-    l_store += (vtype::numlanes - amount_gt_pivot);
+    l_store += (vtype::numlanes - amount_ge_pivot);
     *smallest = vtype::reducemin(min_vec);
     *biggest = vtype::reducemax(max_vec);
     return l_store;
diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp
index 363d2b55..8a9a49ed 100644
--- a/src/avx512fp16-16bit-qsort.hpp
+++ b/src/avx512fp16-16bit-qsort.hpp
@@ -144,6 +144,17 @@ replace_inf_with_nan(_Float16 *arr, int64_t arrsize, int64_t nan_count)
     memset(arr + arrsize - nan_count, 0xFF, nan_count * 2);
 }
 
+template <>
+void avx512_qselect(_Float16 *arr, int64_t k, int64_t arrsize)
+{
+    if (arrsize > 1) {
+        int64_t nan_count = replace_nan_with_inf(arr, arrsize);
+        qselect_16bit_<zmm_vector<_Float16>, _Float16>(
+                arr, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize));
+        replace_inf_with_nan(arr, arrsize, nan_count);
+    }
+}
+
 template <>
 void avx512_qsort(_Float16 *arr, int64_t arrsize)
 {
diff --git a/tests/meson.build b/tests/meson.build
index 0a82d96f..a69222ac 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -11,7 +11,7 @@ endif
 
 if cpp.has_argument('-march=icelake-client')
   libtests += static_library('tests_qsort',
-    files('test_qsort.cpp', ),
+    files('test_sort.cpp', ),
     dependencies: gtest_dep,
     include_directories : [src, utils],
     cpp_args : ['-O3', '-march=icelake-client'],
diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h
new file mode 100644
index 00000000..a41b2c63
--- /dev/null
+++ b/tests/test-qsort-common.h
@@ -0,0 +1,11 @@
+#ifndef AVX512_TEST_COMMON
+#define AVX512_TEST_COMMON
+
+#include "avx512-16bit-qsort.hpp"
+#include "avx512-32bit-qsort.hpp"
+#include "avx512-64bit-qsort.hpp"
+#include "cpuinfo.h"
+#include "rand_array.h"
+#include <gtest/gtest.h>
+
+#endif
diff --git a/tests/test_partial_qsort.hpp b/tests/test_partial_qsort.hpp
new file mode 100644
index 00000000..5c08064e
--- /dev/null
+++ b/tests/test_partial_qsort.hpp
@@ -0,0 +1,49 @@
+#include "test-qsort-common.h"
+
+template <typename T>
+class avx512_partial_sort : public ::testing::Test {
+};
+TYPED_TEST_SUITE_P(avx512_partial_sort);
+
+TYPED_TEST_P(avx512_partial_sort, test_ranges)
+{
+    int64_t arrsize = 1024;
+    int64_t nranges = 500;
+
+    if (cpu_has_avx512bw()) {
+        if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
+            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
+        }
+        std::vector<TypeParam> arr;
+        std::vector<TypeParam> sortedarr;
+        std::vector<TypeParam> psortedarr;
+        /* Random array */
+        arr = get_uniform_rand_array<TypeParam>(arrsize);
+        sortedarr = arr;
+        /* Sort with std::sort for comparison */
+        std::sort(sortedarr.begin(), sortedarr.end());
+
+        for (size_t ii = 0; ii < nranges; ++ii) {
+            psortedarr = arr;
+
+            /* Pick a random number of elements to sort at the beginning of the array */
+            int k = get_uniform_rand_array<int64_t>(1, arrsize, 1).front();
+
+            /* Sort the range and verify all the required elements match the presorted set */
+            avx512_partial_qsort<TypeParam>(psortedarr.data(), k, psortedarr.size());
+            for (size_t jj = 0; jj < k; jj++) {
+                ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
+            }
+
+            psortedarr.clear();
+        }
+
+        arr.clear();
+        sortedarr.clear();
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
+    }
+}
+
+REGISTER_TYPED_TEST_SUITE_P(avx512_partial_sort, test_ranges);
diff --git a/tests/test_qselect.hpp b/tests/test_qselect.hpp
new file mode 100644
index 00000000..cad017bb
--- /dev/null
+++ b/tests/test_qselect.hpp
@@ -0,0 +1,51 @@
+#include "test-qsort-common.h"
+
+template <typename T>
+class avx512_select : public ::testing::Test {
+};
+TYPED_TEST_SUITE_P(avx512_select);
+
+TYPED_TEST_P(avx512_select, test_arrsizes)
+{
+    if (cpu_has_avx512bw()) {
+        if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
+            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
+        }
+        std::vector<int64_t> arrsizes;
+        for (int64_t ii = 0; ii < 1024; ++ii) {
+            arrsizes.push_back(ii);
+        }
+        std::vector<TypeParam> arr;
+        std::vector<TypeParam> sortedarr;
+        std::vector<TypeParam> psortedarr;
+        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
+            /* Random array */
+            arr = get_uniform_rand_array<TypeParam>(arrsizes[ii]);
+            sortedarr = arr;
+            /* Sort with std::sort for comparison */
+            std::sort(sortedarr.begin(), sortedarr.end());
+            for (size_t k = 0; k < arr.size(); ++k) {
+                psortedarr = arr;
+                avx512_qselect<TypeParam>(psortedarr.data(), k, psortedarr.size());
+                /* index k is correct */
+                ASSERT_EQ(sortedarr[k], psortedarr[k]);
+                /* Check left partition */
+                for (size_t jj = 0; jj < k; jj++) {
+                    ASSERT_LE(psortedarr[jj], psortedarr[k]);
+                }
+                /* Check right partition */
+                for (size_t jj = k+1; jj < arr.size(); jj++) {
+                    ASSERT_GE(psortedarr[jj], psortedarr[k]);
+                }
+                psortedarr.clear();
+            }
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
+    }
+}
+
+REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_arrsizes);
diff --git a/tests/test_qsort.cpp b/tests/test_qsort.hpp
similarity index 70%
rename from tests/test_qsort.cpp
rename to tests/test_qsort.hpp
index 6d82a35b..65a8eaf6 100644
--- a/tests/test_qsort.cpp
+++ b/tests/test_qsort.hpp
@@ -3,13 +3,7 @@
  * * SPDX-License-Identifier: BSD-3-Clause
  * *******************************************/
 
-#include "avx512-16bit-qsort.hpp"
-#include "avx512-32bit-qsort.hpp"
-#include "avx512-64bit-qsort.hpp"
-#include "cpuinfo.h"
-#include "rand_array.h"
-#include <gtest/gtest.h>
-#include <vector>
+#include "test-qsort-common.h"
 
 template <typename T>
 class avx512_sort : public ::testing::Test {
@@ -46,13 +40,3 @@ TYPED_TEST_P(avx512_sort, test_arrsizes)
 }
 
 REGISTER_TYPED_TEST_SUITE_P(avx512_sort, test_arrsizes);
-
-using Types = testing::Types<uint16_t,
-                             int16_t,
-                             float,
-                             double,
-                             uint32_t,
-                             int32_t,
-                             uint64_t,
-                             int64_t>;
-INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_sort, Types);
diff --git a/tests/test_qsortfp16.cpp b/tests/test_qsortfp16.cpp
index ab5c10fe..f86d77df 100644
--- a/tests/test_qsortfp16.cpp
+++ b/tests/test_qsortfp16.cpp
@@ -72,3 +72,88 @@ TEST(avx512_qsort_float16, test_special_floats)
         GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA";
     }
 }
+
+TEST(avx512_qselect_float16, test_arrsizes)
+{
+    if (cpu_has_avx512fp16()) {
+        std::vector<int64_t> arrsizes;
+        for (int64_t ii = 0; ii < 1024; ++ii) {
+            arrsizes.push_back(ii);
+        }
+        std::vector<_Float16> arr;
+        std::vector<_Float16> sortedarr;
+        std::vector<_Float16> psortedarr;
+
+        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
+            /* Random array */
+            for (size_t jj = 0; jj < arrsizes[ii]; ++jj) {
+                _Float16 temp = (float)rand() / (float)(RAND_MAX);
+                arr.push_back(temp);
+                sortedarr.push_back(temp);
+            }
+            /* Sort with std::sort for comparison */
+            std::sort(sortedarr.begin(), sortedarr.end());
+            for (size_t k = 0; k < arr.size(); ++k) {
+                psortedarr = arr;
+                avx512_qselect<_Float16>(psortedarr.data(), k, psortedarr.size());
+                /* index k is correct */
+                ASSERT_EQ(sortedarr[k], psortedarr[k]);
+                /* Check left partition */
+                for (size_t jj = 0; jj < k; jj++) {
+                    ASSERT_LE(psortedarr[jj], psortedarr[k]);
+                }
+                /* Check right partition */
+                for (size_t jj = k+1; jj < arr.size(); jj++) {
+                    ASSERT_GE(psortedarr[jj], psortedarr[k]);
+                }
+                psortedarr.clear();
+            }
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA";
+    }
+}
+
+TEST(avx512_partial_qsort_float16, test_ranges)
+{
+    if (cpu_has_avx512fp16()) {
+        int64_t arrsize = 1024;
+        int64_t nranges = 500;
+
+        std::vector<_Float16> arr;
+        std::vector<_Float16> sortedarr;
+        std::vector<_Float16> psortedarr;
+
+        /* Random array */
+        for (size_t ii = 0; ii < arrsize; ++ii) {
+            _Float16 temp = (float)rand() / (float)(RAND_MAX);
+            arr.push_back(temp);
+            sortedarr.push_back(temp);
+        }
+        /* Sort with std::sort for comparison */
+        std::sort(sortedarr.begin(), sortedarr.end());
+
+        for (size_t ii = 0; ii < nranges; ++ii) {
+            psortedarr = arr;
+
+            int k = get_uniform_rand_array<int64_t>(1, arrsize, 1).front();
+
+            /* Sort the range and verify all the required elements match the presorted set */
+            avx512_partial_qsort<_Float16>(psortedarr.data(), k, psortedarr.size());
+            for (size_t jj = 0; jj < k; jj++) {
+                ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
+            }
+
+            psortedarr.clear();
+        }
+
+        arr.clear();
+        sortedarr.clear();
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA";
+    }
+}
diff --git a/tests/test_sort.cpp b/tests/test_sort.cpp
new file mode 100644
index 00000000..85a6bd8d
--- /dev/null
+++ b/tests/test_sort.cpp
@@ -0,0 +1,15 @@
+#include "test_qsort.hpp"
+#include "test_qselect.hpp"
+#include "test_partial_qsort.hpp"
+
+using QuickSortTestTypes = testing::Types<uint16_t,
+                                          int16_t,
+                                          float,
+                                          double,
+                                          uint32_t,
+                                          int32_t,
+                                          uint64_t,
+                                          int64_t>;
+INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_sort, QuickSortTestTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_select, QuickSortTestTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(TestPrefix, avx512_partial_sort, QuickSortTestTypes);