@@ -32112,10 +32112,8 @@ bool GenTree::CanDivOrModPossiblyOverflow(Compiler* comp) const
#if defined(FEATURE_HW_INTRINSICS)
GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
-    if (!opts.Tier0OptimizationEnabled())
-    {
-        return tree;
-    }
+    assert(!optValnumCSE_phase);
+    assert(opts.Tier0OptimizationEnabled());

    NamedIntrinsic ni = tree->GetHWIntrinsicId();
    var_types retType = tree->TypeGet();
@@ -32254,6 +32252,126 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
    // We shouldn't find AND_NOT nodes since it should only be produced in lowering
    assert(oper != GT_AND_NOT);

+#if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_XARCH)
+    if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
+    {
+        // Comparisons that produce masks lead to more verbose trees than
+        // necessary in many scenarios due to requiring a CvtMaskToVector
+        // node to be inserted over them and this can block various opts
+        // that are dependent on tree height and similar. So we want to
+        // fold the unnecessary back and forth conversions away where possible.
+
+        genTreeOps effectiveOper = oper;
+        GenTree* actualOp2 = op2;
+
+        if (oper == GT_NOT)
+        {
+            assert(op2 == nullptr);
+            op2 = op1;
+        }
+
+        // We need both operands to be ConvertMaskToVector in
+        // order to optimize this to a direct mask operation
+
+        if (op1->OperIsConvertMaskToVector())
+        {
+            if (!op2->OperIsHWIntrinsic())
+            {
+                if ((oper == GT_XOR) && op2->IsVectorAllBitsSet())
+                {
+                    // We want to explicitly recognize op1 ^ AllBitsSet as
+                    // some platforms don't have direct support for ~op1
+
+                    effectiveOper = GT_NOT;
+                    op2 = op1;
+                }
+            }
+
+            if (op2->OperIsConvertMaskToVector())
+            {
+                GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic();
+                GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic();
+
+                unsigned simdBaseTypeSize = genTypeSize(simdBaseType);
+
+                if ((genTypeSize(cvtOp1->GetSimdBaseType()) == simdBaseTypeSize) &&
+                    (genTypeSize(cvtOp2->GetSimdBaseType()) == simdBaseTypeSize))
+                {
+                    // We need both operands to be the same kind of mask; otherwise
+                    // the bitwise operation can differ in how it performs
+
+                    NamedIntrinsic maskIntrinsicId = NI_Illegal;
+
+                    switch (effectiveOper)
+                    {
+                        case GT_AND:
+                        {
+                            maskIntrinsicId = NI_AVX512_AndMask;
+                            break;
+                        }
+
+                        case GT_NOT:
+                        {
+                            maskIntrinsicId = NI_AVX512_NotMask;
+                            break;
+                        }
+
+                        case GT_OR:
+                        {
+                            maskIntrinsicId = NI_AVX512_OrMask;
+                            break;
+                        }
+
+                        case GT_XOR:
+                        {
+                            maskIntrinsicId = NI_AVX512_XorMask;
+                            break;
+                        }
+
+                        default:
+                        {
+                            unreached();
+                        }
+                    }
+
+                    assert(maskIntrinsicId != NI_Illegal);
+
+                    if (effectiveOper == oper)
+                    {
+                        tree->ChangeHWIntrinsicId(maskIntrinsicId);
+                        tree->Op(1) = cvtOp1->Op(1);
+                    }
+                    else
+                    {
+                        assert(effectiveOper == GT_NOT);
+                        tree->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1));
+                        tree->gtFlags &= ~GTF_REVERSE_OPS;
+                    }
+
+                    tree->gtType = TYP_MASK;
+                    DEBUG_DESTROY_NODE(op1);
+
+                    if (effectiveOper != GT_NOT)
+                    {
+                        tree->Op(2) = cvtOp2->Op(1);
+                    }
+
+                    if (actualOp2 != nullptr)
+                    {
+                        DEBUG_DESTROY_NODE(actualOp2);
+                    }
+                    tree->SetMorphed(this);
+
+                    tree = gtNewSimdCvtMaskToVectorNode(retType, tree, simdBaseJitType, simdSize)->AsHWIntrinsic();
+                    tree->SetMorphed(this);
+
+                    return tree;
+                }
+            }
+        }
+    }
+#endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_XARCH
+
    switch (ni)
    {
        // There's certain IR simplifications that are possible and which
@@ -32830,10 +32948,28 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
            oper = GT_NONE;
        }

+        // For mask nodes in particular, the foldings below are done under the presumption
+        // that we only produce something like `AddMask(op1, op2)` if op1 and op2 are compatible
+        // masks. On xarch, for example, this means we'd only ever be adding two 8-bit, 16-bit,
+        // 32-bit, or 64-bit masks of the same size; we'd never encounter something like an
+        // 8-bit and a 16-bit mask being added. This ensures that we don't end up with a case
+        // where folding would cause a different result to be produced, such as because the
+        // remaining upper bits are no longer zeroed.
+
        switch (oper)
        {
            case GT_ADD:
            {
+                if (varTypeIsMask(retType))
+                {
+                    // Handle `x + 0 == x` and `0 + x == x`
+                    if (cnsNode->IsMaskZero())
+                    {
+                        resultNode = otherNode;
+                    }
+                    break;
+                }
+
                if (varTypeIsFloating(simdBaseType))
                {
                    // Handle `x + NaN == NaN` and `NaN + x == NaN`
@@ -32867,6 +33003,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)

            case GT_AND:
            {
+                if (varTypeIsMask(retType))
+                {
+                    // Handle `x & 0 == 0` and `0 & x == 0`
+                    if (cnsNode->IsMaskZero())
+                    {
+                        resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                        break;
+                    }
+
+                    // Handle `x & AllBitsSet == x` and `AllBitsSet & x == x`
+                    if (cnsNode->IsMaskAllBitsSet())
+                    {
+                        resultNode = otherNode;
+                    }
+                    break;
+                }
+
                // Handle `x & 0 == 0` and `0 & x == 0`
                if (cnsNode->IsVectorZero())
                {
@@ -33100,6 +33253,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)

            case GT_OR:
            {
+                if (varTypeIsMask(retType))
+                {
+                    // Handle `x | 0 == x` and `0 | x == x`
+                    if (cnsNode->IsMaskZero())
+                    {
+                        resultNode = otherNode;
+                        break;
+                    }
+
+                    // Handle `x | AllBitsSet == AllBitsSet` and `AllBitsSet | x == AllBitsSet`
+                    if (cnsNode->IsMaskAllBitsSet())
+                    {
+                        resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                    }
+                    break;
+                }
+
                // Handle `x | 0 == x` and `0 | x == x`
                if (cnsNode->IsVectorZero())
                {
@@ -33127,6 +33297,27 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                // Handle `x >> 0 == x` and `0 >> x == 0`
                // Handle `x >>> 0 == x` and `0 >>> x == 0`

+                if (varTypeIsMask(retType))
+                {
+                    if (cnsNode->IsMaskZero())
+                    {
+                        if (cnsNode == op2)
+                        {
+                            resultNode = otherNode;
+                        }
+                        else
+                        {
+                            resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
+                        }
+                    }
+                    else if (cnsNode->IsIntegralConst(0))
+                    {
+                        assert(cnsNode == op2);
+                        resultNode = otherNode;
+                    }
+                    break;
+                }
+
                if (cnsNode->IsVectorZero())
                {
                    if (cnsNode == op2)
@@ -33172,7 +33363,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)

            case GT_XOR:
            {
-                // Handle `x | 0 == x` and `0 | x == x`
+                if (varTypeIsMask(retType))
+                {
+                    // Handle `x ^ 0 == x` and `0 ^ x == x`
+                    if (cnsNode->IsMaskZero())
+                    {
+                        resultNode = otherNode;
+                    }
+                    break;
+                }
+
+                // Handle `x ^ 0 == x` and `0 ^ x == x`
                if (cnsNode->IsVectorZero())
                {
                    resultNode = otherNode;
@@ -33341,7 +33542,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
        }
        else
        {
-            assert(!op1->IsTrueMask(simdBaseType) && !op1->IsFalseMask());
+            assert(!op1->IsTrueMask(simdBaseType) && !op1->IsMaskZero());
        }
#endif

@@ -33359,7 +33560,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
            return op2;
        }

-        if (op1->IsVectorZero() || op1->IsFalseMask())
+        if (op1->IsVectorZero() || op1->IsMaskZero())
        {
            return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT);
        }
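
The mask folding added above rewrites shapes like `AND(CvtMaskToVector(m1), CvtMaskToVector(m2))` into `CvtMaskToVector(AndMask(m1, m2))`. That is only safe because converting a mask to a vector commutes with the bitwise operation when both masks use the same element size, which is what the simdBaseTypeSize check guards. A minimal standalone C++ sketch of that invariant (toy types and names such as Mask8 and cvt_mask_to_vector, not JIT code):

// Toy model: one 8-bit mask register, one bit per 64-bit vector element.
#include <array>
#include <cassert>
#include <cstdint>

using Mask8   = uint8_t;
using Vec8x64 = std::array<uint64_t, 8>;

// Models ConvertMaskToVector: each set mask bit becomes an all-ones element.
static Vec8x64 cvt_mask_to_vector(Mask8 m)
{
    Vec8x64 v{};
    for (int i = 0; i < 8; i++)
    {
        v[i] = ((m >> i) & 1) ? ~0ull : 0ull;
    }
    return v;
}

int main()
{
    Mask8 m1 = 0b10110010;
    Mask8 m2 = 0b01110100;

    // Bitwise AND performed on the converted vectors...
    Vec8x64 v1 = cvt_mask_to_vector(m1);
    Vec8x64 v2 = cvt_mask_to_vector(m2);
    Vec8x64 vectorAnd{};
    for (int i = 0; i < 8; i++)
    {
        vectorAnd[i] = v1[i] & v2[i];
    }

    // ...matches converting the result of the mask-level AND, i.e. the
    // CvtMaskToVector(AndMask(m1, m2)) shape the folding produces.
    assert(vectorAnd == cvt_mask_to_vector(static_cast<Mask8>(m1 & m2)));
    return 0;
}

If the element sizes differed (say an 8-bit mask combined with a 16-bit mask), the bit-to-element mapping would not line up and the rewrite could change results, which is why the fold only fires when both GetSimdBaseType sizes match.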