Skip to content

Commit 45777bc

Browse files
author
Raghuveer Devulapalli
authored
Merge pull request #86 from r-devulap/np-ci-fixes
Fix NumPy CI failures
2 parents 0fad81f + 65e04b6 commit 45777bc

File tree

2 files changed

+26
-133
lines changed

2 files changed

+26
-133
lines changed

src/avx512-16bit-common.h

Lines changed: 20 additions & 128 deletions
Original file line number | Diff line number | Diff line change
@@ -99,38 +99,11 @@ struct avx512_16bit_swizzle_ops {
9999
__m512i v = vtype::cast_to(reg);
100100

101101
if constexpr (scale == 2) {
102-
__m512i mask = _mm512_set_epi16(30,
103-
31,
104-
28,
105-
29,
106-
26,
107-
27,
108-
24,
109-
25,
110-
22,
111-
23,
112-
20,
113-
21,
114-
18,
115-
19,
116-
16,
117-
17,
118-
14,
119-
15,
120-
12,
121-
13,
122-
10,
123-
11,
124-
8,
125-
9,
126-
6,
127-
7,
128-
4,
129-
5,
130-
2,
131-
3,
132-
0,
133-
1);
102+
std::vector<uint16_t> arr
103+
= {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
104+
10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
105+
23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
106+
__m512i mask = _mm512_loadu_si512(arr.data());
134107
v = _mm512_permutexvar_epi16(mask, v);
135108
}
136109
else if constexpr (scale == 4) {
@@ -160,108 +133,27 @@ struct avx512_16bit_swizzle_ops {
160133

161134
if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
162135
else if constexpr (scale == 4) {
163-
__m512i mask = _mm512_set_epi16(28,
164-
29,
165-
30,
166-
31,
167-
24,
168-
25,
169-
26,
170-
27,
171-
20,
172-
21,
173-
22,
174-
23,
175-
16,
176-
17,
177-
18,
178-
19,
179-
12,
180-
13,
181-
14,
182-
15,
183-
8,
184-
9,
185-
10,
186-
11,
187-
4,
188-
5,
189-
6,
190-
7,
191-
0,
192-
1,
193-
2,
194-
3);
136+
std::vector<uint16_t> arr
137+
= {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
138+
8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
139+
21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
140+
__m512i mask = _mm512_loadu_si512(arr.data());
195141
v = _mm512_permutexvar_epi16(mask, v);
196142
}
197143
else if constexpr (scale == 8) {
198-
__m512i mask = _mm512_set_epi16(24,
199-
25,
200-
26,
201-
27,
202-
28,
203-
29,
204-
30,
205-
31,
206-
16,
207-
17,
208-
18,
209-
19,
210-
20,
211-
21,
212-
22,
213-
23,
214-
8,
215-
9,
216-
10,
217-
11,
218-
12,
219-
13,
220-
14,
221-
15,
222-
0,
223-
1,
224-
2,
225-
3,
226-
4,
227-
5,
228-
6,
229-
7);
144+
std::vector<uint16_t> arr
145+
= {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
146+
12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
147+
17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
148+
__m512i mask = _mm512_loadu_si512(arr.data());
230149
v = _mm512_permutexvar_epi16(mask, v);
231150
}
232151
else if constexpr (scale == 16) {
233-
__m512i mask = _mm512_set_epi16(16,
234-
17,
235-
18,
236-
19,
237-
20,
238-
21,
239-
22,
240-
23,
241-
24,
242-
25,
243-
26,
244-
27,
245-
28,
246-
29,
247-
30,
248-
31,
249-
0,
250-
1,
251-
2,
252-
3,
253-
4,
254-
5,
255-
6,
256-
7,
257-
8,
258-
9,
259-
10,
260-
11,
261-
12,
262-
13,
263-
14,
264-
15);
152+
std::vector<uint16_t> arr
153+
= {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
154+
4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26,
155+
25, 24, 23, 22, 21, 20, 19, 18, 17, 16};
156+
__m512i mask = _mm512_loadu_si512(arr.data());
265157
v = _mm512_permutexvar_epi16(mask, v);
266158
}
267159
else if constexpr (scale == 32) {

src/avx512-common-qsort.h

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
#include <cstring>
4242
#include <immintrin.h>
4343
#include <limits>
44+
#include <vector>
4445

4546
#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
4647
#define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
@@ -249,7 +250,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store,
249250
reg_t &biggest_vec)
250251
{
251252
typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
252-
arrsize_t amount_ge_pivot = _mm_popcnt_u64(ge_mask);
253+
int amount_ge_pivot = _mm_popcnt_u32((int)ge_mask);
253254

254255
vtype::mask_compressstoreu(l_store, vtype::knot_opmask(ge_mask), curr_vec);
255256
vtype::mask_compressstoreu(
@@ -450,17 +451,17 @@ X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
450451
X86_SIMD_SORT_UNROLL_LOOP(8)
451452
for (int ii = 0; ii < num_unroll; ++ii) {
452453
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
453-
_mm_prefetch(arr + right + ii * vtype::numlanes
454-
- num_unroll * vtype::numlanes,
454+
_mm_prefetch((char *)(arr + right + ii * vtype::numlanes
455+
- num_unroll * vtype::numlanes),
455456
_MM_HINT_T0);
456457
}
457458
}
458459
else {
459460
X86_SIMD_SORT_UNROLL_LOOP(8)
460461
for (int ii = 0; ii < num_unroll; ++ii) {
461462
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
462-
_mm_prefetch(arr + left + ii * vtype::numlanes
463-
+ num_unroll * vtype::numlanes,
463+
_mm_prefetch((char *)(arr + left + ii * vtype::numlanes
464+
+ num_unroll * vtype::numlanes),
464465
_MM_HINT_T0);
465466
}
466467
left += num_unroll * vtype::numlanes;

0 commit comments

Comments
 (0)