@@ -32083,10 +32083,8 @@ bool GenTree::CanDivOrModPossiblyOverflow(Compiler* comp) const
#if defined(FEATURE_HW_INTRINSICS)
GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
-    if (!opts.Tier0OptimizationEnabled())
-    {
-        return tree;
-    }
+    assert(!optValnumCSE_phase);
+    assert(opts.Tier0OptimizationEnabled());

    NamedIntrinsic ni = tree->GetHWIntrinsicId();
    var_types retType = tree->TypeGet();
@@ -32225,6 +32223,133 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
    // We shouldn't find AND_NOT nodes since it should only be produced in lowering
    assert(oper != GT_AND_NOT);

+#if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_XARCH)
+    if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
+    {
+        // Comparisons that produce masks lead to more verbose trees than
+        // necessary in many scenarios due to requiring a CvtMaskToVector
+        // node to be inserted over them and this can block various opts
+        // that are dependent on tree height and similar. So we want to
+        // fold the unnecessary back and forth conversions away where possible.
+
+        genTreeOps effectiveOper = oper;
+        GenTree* actualOp2 = op2;
+
+        if (oper == GT_NOT)
+        {
+            assert(op2 == nullptr);
+            op2 = op1;
+        }
+
+        // We need both operands to be ConvertMaskToVector in
+        // order to optimize this to a direct mask operation
+
+        if (!op1->OperIsConvertMaskToVector())
+        {
+            return tree;
+        }
+
+        if (!op2->OperIsHWIntrinsic())
+        {
+            if ((oper != GT_XOR) || !op2->IsVectorAllBitsSet())
+            {
+                return tree;
+            }
+
+            // We want to explicitly recognize op1 ^ AllBitsSet as
+            // some platforms don't have direct support for ~op1
+
+            effectiveOper = GT_NOT;
+            op2 = op1;
+        }
+
+        GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic();
+        GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic();
+
+        if (!cvtOp2->OperIsConvertMaskToVector())
+        {
+            return tree;
+        }
+
+        unsigned simdBaseTypeSize = genTypeSize(simdBaseType);
+
+        if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize) ||
+            (genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize))
+        {
+            // We need both operands to be the same kind of mask; otherwise
+            // the bitwise operation can differ in how it performs
+            return tree;
+        }
+
+        NamedIntrinsic maskIntrinsicId = NI_Illegal;
+
+        switch (effectiveOper)
+        {
+            case GT_AND:
+            {
+                maskIntrinsicId = NI_AVX512_AndMask;
+                break;
+            }
+
+            case GT_NOT:
+            {
+                maskIntrinsicId = NI_AVX512_NotMask;
+                break;
+            }
+
+            case GT_OR:
+            {
+                maskIntrinsicId = NI_AVX512_OrMask;
+                break;
+            }
+
+            case GT_XOR:
+            {
+                maskIntrinsicId = NI_AVX512_XorMask;
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+
+        assert(maskIntrinsicId != NI_Illegal);
+
+        if (effectiveOper == oper)
+        {
+            tree->ChangeHWIntrinsicId(maskIntrinsicId);
+            tree->Op(1) = cvtOp1->Op(1);
+        }
+        else
+        {
+            assert(effectiveOper == GT_NOT);
+            tree->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1));
+            tree->gtFlags &= ~GTF_REVERSE_OPS;
+        }
+
+        tree->gtType = TYP_MASK;
+        DEBUG_DESTROY_NODE(op1);
+
+        if (effectiveOper != GT_NOT)
+        {
+            tree->Op(2) = cvtOp2->Op(1);
+        }
+
+        if (actualOp2 != nullptr)
+        {
+            DEBUG_DESTROY_NODE(actualOp2);
+        }
+        tree->SetMorphed(this);
+
+        tree = gtNewSimdCvtMaskToVectorNode(retType, tree, simdBaseJitType, simdSize)->AsHWIntrinsic();
+        tree->SetMorphed(this);
+
+        return tree;
+    }
+#endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_XARCH
+
    GenTree* cnsNode = nullptr;
    GenTree* otherNode = nullptr;

@@ -32762,10 +32887,28 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
        oper = GT_NONE;
    }

+    // For mask nodes in particular, the foldings below are done under the presumption
+    // that we only produce something like `AddMask(op1, op2)` if op1 and op2 are compatible
+    // masks. On xarch, for example, this means it'd be adding 8-, 16-, 32-, or 64-bit
+    // masks of the same size together; we wouldn't ever encounter something like an 8-bit
+    // and a 16-bit mask being added. This ensures that we don't end up with a case where
+    // folding would cause a different result to be produced, such as because the remaining
+    // upper bits are no longer zeroed.
+
    switch (oper)
    {
        case GT_ADD:
        {
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x + 0 == x` and `0 + x == x`
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
            if (varTypeIsFloating(simdBaseType))
            {
                // Handle `x + NaN == NaN` and `NaN + x == NaN`
@@ -32799,6 +32942,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)

        case GT_AND:
        {
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x & 0 == 0` and `0 & x == 0`
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                    break;
+                }
+
+                // Handle `x & AllBitsSet == x` and `AllBitsSet & x == x`
+                if (cnsNode->IsMaskAllBitsSet())
+                {
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
            // Handle `x & 0 == 0` and `0 & x == 0`
            if (cnsNode->IsVectorZero())
            {
@@ -33032,6 +33192,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)

        case GT_OR:
        {
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x | 0 == x` and `0 | x == x`
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = otherNode;
+                    break;
+                }
+
+                // Handle `x | AllBitsSet == AllBitsSet` and `AllBitsSet | x == AllBitsSet`
+                if (cnsNode->IsMaskAllBitsSet())
+                {
+                    resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                }
+                break;
+            }
+
            // Handle `x | 0 == x` and `0 | x == x`
            if (cnsNode->IsVectorZero())
            {
@@ -33059,6 +33236,27 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
            // Handle `x >> 0 == x` and `0 >> x == 0`
            // Handle `x >>> 0 == x` and `0 >>> x == 0`

+            if (varTypeIsMask(retType))
+            {
+                if (cnsNode->IsMaskZero())
+                {
+                    if (cnsNode == op2)
+                    {
+                        resultNode = otherNode;
+                    }
+                    else
+                    {
+                        resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                    }
+                }
+                else if (cnsNode->IsIntegralConst(0))
+                {
+                    assert(cnsNode == op2);
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
            if (cnsNode->IsVectorZero())
            {
                if (cnsNode == op2)
@@ -33104,7 +33302,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)

        case GT_XOR:
        {
-            // Handle `x | 0 == x` and `0 | x == x`
+            if (varTypeIsMask(retType))
+            {
+                // Handle `x ^ 0 == x` and `0 ^ x == x`
+                if (cnsNode->IsMaskZero())
+                {
+                    resultNode = otherNode;
+                }
+                break;
+            }
+
+            // Handle `x ^ 0 == x` and `0 ^ x == x`
            if (cnsNode->IsVectorZero())
            {
                resultNode = otherNode;
@@ -33273,7 +33481,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
        }
        else
        {
-            assert(!op1->IsTrueMask(simdBaseType) && !op1->IsFalseMask());
+            assert(!op1->IsTrueMask(simdBaseType) && !op1->IsMaskZero());
        }
#endif

@@ -33291,7 +33499,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
            return op2;
        }

-        if (op1->IsVectorZero() || op1->IsFalseMask())
+        if (op1->IsVectorZero() || op1->IsMaskZero())
        {
            return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT);
        }