Commit fcd5f84

WIP: Fix support for Intel Compute Runtime
The fallback implementation of amd_bitalign() doesn't work with recent Intel Compute Runtime (NEO) when the destination is the same as one of the sources: the s1 component of the resulting vectors gets corrupted. Because of that, about half of the self-tests fail with VectorSize=2, even though all tests pass with VectorSize=1.

Add generic_bitalign(), which is always implemented using shifts: 64-bit shifts for Intel NEO, 32-bit shifts for other platforms. Use generic_bitalign() instead of the equivalent shifts in all cases where the destination is the same as one of the sources.
1 parent ef462f1 commit fcd5f84
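For context, the aliasing case the message describes looks like this in shl_96() (see the barrett.cl hunk below); a->d2 is both the destination and the first source of the bitalign:

    a->d2 = amd_bitalign(a->d2, a->d1, 31);
    /* Without cl_amd_media_ops, amd_bitalign() used to expand to the shift fallback
       a->d2 = (a->d2 << (32 - 31)) | (a->d1 >> 31);
       which recent Intel NEO miscompiles with VectorSize=2: the s1 component of the
       result gets corrupted. With this commit the fallback maps to generic_bitalign(),
       which uses 64-bit shifts on Intel NEO and 32-bit shifts on other platforms. */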

File tree: 2 files changed (+44, -28 lines)

src/barrett.cl
src/common.cl

src/barrett.cl

Lines changed: 27 additions & 27 deletions
@@ -253,27 +253,27 @@ Adding x*x to a few carries will not cascade the carry
 
 void shl_96(int96_v * const a)
 /* shiftleft a one bit */
-{ /* here, bitalign improves the 92-bit kernel, and slows down 76-bit */
+{ /* here, amd_bitalign improves the 92-bit kernel, and slows down 76-bit */
   a->d2 = amd_bitalign(a->d2, a->d1, 31);
   a->d1 = amd_bitalign(a->d1, a->d0, 31);
-  // a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-  // a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+  // a->d2 = generic_bitalign(a->d2, a->d1, 31);
+  // a->d1 = generic_bitalign(a->d1, a->d0, 31);
   a->d0 = a->d0 << 1;
 }
 
 void shl_192(int192_v * const a)
 /* shiftleft a one bit */
-{ /* in this function, bitalign slows down all kernels */
+{ /* in this function, amd_bitalign slows down all kernels */
   // a->d5 = amd_bitalign(a->d5, a->d4, 31);
   // a->d4 = amd_bitalign(a->d4, a->d3, 31);
   // a->d3 = amd_bitalign(a->d3, a->d2, 31);
   // a->d2 = amd_bitalign(a->d2, a->d1, 31);
   // a->d1 = amd_bitalign(a->d1, a->d0, 31);
-  a->d5 = (a->d5 << 1) | (a->d4 >> 31);
-  a->d4 = (a->d4 << 1) | (a->d3 >> 31);
-  a->d3 = (a->d3 << 1) | (a->d2 >> 31);
-  a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-  a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+  a->d5 = generic_bitalign(a->d5, a->d4, 31);
+  a->d4 = generic_bitalign(a->d4, a->d3, 31);
+  a->d3 = generic_bitalign(a->d3, a->d2, 31);
+  a->d2 = generic_bitalign(a->d2, a->d1, 31);
+  a->d1 = generic_bitalign(a->d1, a->d0, 31);
   a->d0 = a->d0 << 1;
 }
 
@@ -442,12 +442,12 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 
   // shiftleft nn 11 bits
 #ifndef DIV_160_96
-  nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
   nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-  // nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-  // nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+  // nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+  // nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
   nn.d0 = nn.d0 << 11;
 
   // q = q - nn
@@ -510,11 +510,11 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
   nn.d4 = nn.d3 >> 9;
 #endif
   // nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-  // nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+  // nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
   // nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-  nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
   nn.d0 = nn.d0 << 23;
 
   // q = q - nn
@@ -642,9 +642,9 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 #ifdef CHECKS_MODBASECASE
   nn.d4 = nn.d3 >> 17;
 #endif
-  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+  nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
   nn.d0 = nn.d0 << 15;
 
   // q = q - nn
@@ -877,12 +877,12 @@ DIV_160_96 here. */
 
   // shiftleft nn 11 bits
 #ifndef DIV_160_96
-  nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
   nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-  // nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-  // nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+  // nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+  // nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
   nn.d0 = nn.d0 << 11;
 
   // q = q - nn
@@ -945,11 +945,11 @@ DIV_160_96 here. */
   nn.d4 = nn.d3 >> 9;
 #endif
   // nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-  // nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+  // nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
   // nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-  nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
   nn.d0 = nn.d0 << 23;
 
   // q = q - nn
@@ -1077,9 +1077,9 @@ DIV_160_96 here. */
 #ifdef CHECKS_MODBASECASE
   nn.d4 = nn.d3 >> 17;
 #endif
-  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+  nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
   nn.d0 = nn.d0 << 15;
 
   // q = q - nn
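
A note on the rewritten shift counts above: generic_bitalign(high, low, s) computes (high << (32 - s)) | (low >> s), so a plain left shift by k bits corresponds to a call with s = 32 - k, and swapping the original '+' for the OR inside generic_bitalign changes nothing because the two shifted halves never have overlapping bits. For example, from the first div_192_96() hunk:

    nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);       // old form: left shift by 11 bits
    nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);  // new form: 21 = 32 - 11, same value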

src/common.cl

Lines changed: 17 additions & 1 deletion
@@ -170,6 +170,22 @@ uint popcount(uint x)
 #define ATOMIC_INC(x) ((x)++)
 #endif
 
+// generic_bitalign emulates amd_bitalign using shifts. generic_bitalign can be
+// used instead of amd_bitalign if benchmarks show that it's faster.
+#ifdef cl_intel_subgroups
+// Workaround for Intel NEO that miscompiles shifts on uint vectors - use ulong instead
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+  return CONVERT_UINT_V(((CONVERT_ULONG_V(high) << 32) | CONVERT_ULONG_V(low)) >> shift);
+}
+#else
+// Non-Intel generic_bitalign implementation uses uint vectors
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+  return (high << (32 - shift)) | (low >> shift);
+}
+#endif
+
 #ifdef cl_amd_media_ops
 #pragma OPENCL EXTENSION cl_amd_media_ops : enable
 #else
@@ -180,7 +196,7 @@ uint popcount(uint x)
 // Description
 // dst.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31))
 // similar operation applied to other components of the vectors.
-#define amd_bitalign(src0, src1, src2) (src0 << (32-src2)) | (src1 >> src2)
+#define amd_bitalign(src0, src1, src2) generic_bitalign(src0, src1, src2)
 #endif
 
 #ifdef cl_amd_media_ops2
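
As a quick sanity check of the two generic_bitalign() variants added above, here is a small host-side C sketch (not from the repository) that compares them against the amd_bitalign() semantics quoted in common.cl, using scalar uint32_t values in place of the uint_v vector type. All shift counts used in barrett.cl fall in the range 1..31, which is what the check covers (a shift of 0 would be undefined behaviour in the 32-bit variant).

    /* Host-side sketch: verify that the 32-bit-shift and 64-bit-shift emulations
     * of amd_bitalign() agree with the documented semantics for shifts 1..31.
     * Scalar uint32_t stands in for the uint_v vector type used in common.cl. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* non-Intel path: 32-bit shifts */
    static uint32_t bitalign_32(uint32_t high, uint32_t low, int shift)
    {
        return (high << (32 - shift)) | (low >> shift);
    }

    /* Intel NEO path: 64-bit shifts */
    static uint32_t bitalign_64(uint32_t high, uint32_t low, int shift)
    {
        return (uint32_t)((((uint64_t)high << 32) | (uint64_t)low) >> shift);
    }

    /* reference, per the cl_amd_media_ops description quoted in common.cl:
     * dst = (uint)((((long)src0 << 32) | (long)src1) >> (src2 & 31)) */
    static uint32_t bitalign_ref(uint32_t high, uint32_t low, int shift)
    {
        return (uint32_t)((((uint64_t)high << 32) | (uint64_t)low) >> (shift & 31));
    }

    int main(void)
    {
        const uint32_t samples[] = { 0u, 1u, 0x80000000u, 0x12345678u, 0xdeadbeefu, 0xffffffffu };
        const int n = (int)(sizeof(samples) / sizeof(samples[0]));

        for (int shift = 1; shift <= 31; shift++)
            for (int i = 0; i < n; i++)
                for (int j = 0; j < n; j++) {
                    const uint32_t hi = samples[i], lo = samples[j];
                    assert(bitalign_32(hi, lo, shift) == bitalign_ref(hi, lo, shift));
                    assert(bitalign_64(hi, lo, shift) == bitalign_ref(hi, lo, shift));
                }

        puts("32-bit and 64-bit bitalign emulations agree for shifts 1..31");
        return 0;
    }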
