From 8a23510c64d6b35c9a2b3eb183f2526dd336aac9 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 11 Apr 2025 12:19:06 +0100
Subject: [PATCH 01/62] Arm64 SVE: Better optimise zero/allbits vectors

Fixes #114443

* IsVectorZero() should allow for all zero vectors and false masks that
  have been converted to vectors.

* IsVectorAllBitsSet() should allow for all bits set vectors and true
  masks that have been converted to vectors.

* IsMaskZero() should allow for false masks and all zero vectors that
  have been converted to masks.

* IsMaskAllBitsSet() should allow for true masks and all bits set
  vectors that have been converted to masks.

In addition:

* Fix up all the errors caused by these changes.

* Add a bunch of asmcheck tests
---
 src/coreclr/jit/gentree.cpp                   |  35 +-
 src/coreclr/jit/gentree.h                     | 129 ++++--
 src/coreclr/jit/lowerarmarch.cpp              |  22 +-
 .../JIT/opt/MaskConversions/ConstantMasks.cs  | 253 ++++++++++++
 .../opt/MaskConversions/ConstantMasks.csproj  |  19 +
 .../MaskConversions/ConstantMasksOp2Fixed.cs  | 366 ++++++++++++++++++
 .../ConstantMasksOp2Fixed.csproj              |  19 +
 7 files changed, 800 insertions(+), 43 deletions(-)
 create mode 100644 src/tests/JIT/opt/MaskConversions/ConstantMasks.cs
 create mode 100644 src/tests/JIT/opt/MaskConversions/ConstantMasks.csproj
 create mode 100644 src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs
 create mode 100644 src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.csproj

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 6a062b02f2b12d..04bad868bb76ce 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -20437,7 +20437,8 @@ bool GenTree::isContainableHWIntrinsic() const
         }
     }
 #elif defined(TARGET_ARM64)
-    return (AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect);
+    return (AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect || IsVectorAllBitsSet() ||
+            IsMaskAllBitsSet() || IsVectorZero() || IsMaskZero());
 #else
     return false;
 #endif // TARGET_XARCH
@@ -28131,6 +28132,38 @@ bool GenTree::OperIsVectorConditionalSelect() const
     return false;
 }
 
+//------------------------------------------------------------------------
+// OperIsFalseMask: Is this a vector CreateFalseMask hwintrinsic
+//
+// Return Value:
+//    true if the node is a vector CreateFalseMask hwintrinsic
+//    otherwise; false
+//
+bool GenTree::OperIsFalseMask() const
+{
+    if (OperIsHWIntrinsic())
+    {
+        return AsHWIntrinsic()->OperIsFalseMask();
+    }
+    return false;
+}
+
+//------------------------------------------------------------------------
+// OperIsTrueMask: Is this a vector CreateTrueMask hwintrinsic
+//
+// Return Value:
+//    true if the node is a vector CreateTrueMask hwintrinsic
+//    otherwise; false
+//
+bool GenTree::OperIsTrueMask() const
+{
+    if (OperIsHWIntrinsic())
+    {
+        return AsHWIntrinsic()->OperIsTrueMask();
+    }
+    return false;
+}
+
 //------------------------------------------------------------------------
 // OperIsMemoryLoad: Does this HWI node have memory load semantics?
 //
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index f3fb94b09429e8..cc7c97f4cd0c64 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1688,6 +1688,8 @@ struct GenTree
     bool OperIsConvertMaskToVector() const;
     bool OperIsConvertVectorToMask() const;
     bool OperIsVectorConditionalSelect() const;
+    bool OperIsFalseMask() const;
+    bool OperIsTrueMask() const;
 
     // This is here for cleaner GT_LONG #ifdefs.
static bool OperIsLong(genTreeOps gtOper) @@ -6477,6 +6479,37 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic #endif } + bool OperIsFalseMask() const + { +#if defined(TARGET_ARM64) + static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble, + NI_Sve_CreateFalseMaskInt16, NI_Sve_CreateFalseMaskInt32, + NI_Sve_CreateFalseMaskInt64, NI_Sve_CreateFalseMaskSByte, + NI_Sve_CreateFalseMaskSingle, NI_Sve_CreateFalseMaskUInt16, + NI_Sve_CreateFalseMaskUInt32, NI_Sve_CreateFalseMaskUInt64)); + + NamedIntrinsic id = GetHWIntrinsicId(); + return ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64)); +#endif + return false; + } + + bool OperIsTrueMask() const + { +#if defined(TARGET_ARM64) + static_assert_no_msg(AreContiguous(NI_Sve_CreateTrueMaskByte, NI_Sve_CreateTrueMaskDouble, + NI_Sve_CreateTrueMaskInt16, NI_Sve_CreateTrueMaskInt32, + NI_Sve_CreateTrueMaskInt64, NI_Sve_CreateTrueMaskSByte, + NI_Sve_CreateTrueMaskSingle, NI_Sve_CreateTrueMaskUInt16, + NI_Sve_CreateTrueMaskUInt32, NI_Sve_CreateTrueMaskUInt64)); + + NamedIntrinsic id = GetHWIntrinsicId(); + return ((id == NI_Sve_CreateTrueMaskAll) || + ((id >= NI_Sve_CreateTrueMaskByte) && (id <= NI_Sve_CreateTrueMaskUInt64))); +#endif + return false; + } + bool OperRequiresAsgFlag() const; bool OperRequiresCallFlag() const; bool OperRequiresGlobRefFlag() const; @@ -9592,10 +9625,21 @@ inline bool GenTree::IsFloatPositiveZero() const inline bool GenTree::IsVectorZero() const { #if defined(FEATURE_SIMD) - return IsCnsVec() && AsVecCon()->IsZero(); -#else - return false; + if (IsCnsVec() && AsVecCon()->IsZero()) + { + return true; + } + +#if defined(TARGET_ARM64) + // Can also be an all false mask that has been converted to a vector. + if (OperIsConvertMaskToVector() && AsHWIntrinsic()->Op(1)->OperIsFalseMask()) + { + return true; + } +#endif // TARGET_ARM64 #endif // FEATURE_SIMD + + return false; } //------------------------------------------------------------------- @@ -9618,7 +9662,7 @@ inline bool GenTree::IsVectorNegativeZero(var_types simdBaseType) const } //------------------------------------------------------------------- -// IsVectorZero: returns true if this node is a vector constant with all bits zero. +// IsVectorNaN: returns true if this node is a vector constant with all bits zero. // // Arguments: // simdBaseType - the base type of the constant being checked @@ -9681,6 +9725,14 @@ inline bool GenTree::IsVectorAllBitsSet() const { return AsVecCon()->IsAllBitsSet(); } + +#if defined(TARGET_ARM64) + // Can also be an all true mask that has been converted to a vector. + if (OperIsConvertMaskToVector() && AsHWIntrinsic()->Op(1)->OperIsTrueMask()) + { + return true; + } +#endif // TARGET_ARM64 #endif // FEATURE_SIMD return false; @@ -9704,54 +9756,65 @@ inline bool GenTree::IsVectorBroadcast(var_types simdBaseType) const return false; } +//------------------------------------------------------------------- +// IsMaskAllBitsSet: returns true if this node is a mask constant with all bits set. 
+// +// Returns: +// True if this node is a mask constant with all bits set +// inline bool GenTree::IsMaskAllBitsSet() const { -#ifdef TARGET_ARM64 - static_assert_no_msg(AreContiguous(NI_Sve_CreateTrueMaskByte, NI_Sve_CreateTrueMaskDouble, - NI_Sve_CreateTrueMaskInt16, NI_Sve_CreateTrueMaskInt32, - NI_Sve_CreateTrueMaskInt64, NI_Sve_CreateTrueMaskSByte, - NI_Sve_CreateTrueMaskSingle, NI_Sve_CreateTrueMaskUInt16, - NI_Sve_CreateTrueMaskUInt32, NI_Sve_CreateTrueMaskUInt64)); +#if defined(TARGET_ARM64) - if (OperIsHWIntrinsic()) + if (OperIsTrueMask()) { - NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); - if (id == NI_Sve_ConvertMaskToVector) + return true; + } + + // Can also be an all bits set vector that has been converted to a mask. + if (OperIsConvertVectorToMask()) + { + assert(AsHWIntrinsic()->Op(1)->OperIsTrueMask()); + + GenTree* op2 = AsHWIntrinsic()->Op(2); + if (op2->IsCnsVec() && op2->AsVecCon()->IsAllBitsSet()) { - GenTree* op1 = AsHWIntrinsic()->Op(1); - assert(op1->OperIsHWIntrinsic()); - id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); + return true; } - return ((id == NI_Sve_CreateTrueMaskAll) || - ((id >= NI_Sve_CreateTrueMaskByte) && (id <= NI_Sve_CreateTrueMaskUInt64))); } -#endif +#endif // TARGET_ARM64 return false; } +//------------------------------------------------------------------- +// IsMaskZero: returns true if this node is a mask constant with all bits zero. +// +// Returns: +// True if this node is a mask constant with all bits zero +// inline bool GenTree::IsMaskZero() const { -#ifdef TARGET_ARM64 - static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble, - NI_Sve_CreateFalseMaskInt16, NI_Sve_CreateFalseMaskInt32, - NI_Sve_CreateFalseMaskInt64, NI_Sve_CreateFalseMaskSByte, - NI_Sve_CreateFalseMaskSingle, NI_Sve_CreateFalseMaskUInt16, - NI_Sve_CreateFalseMaskUInt32, NI_Sve_CreateFalseMaskUInt64)); +#if defined(TARGET_ARM64) - if (OperIsHWIntrinsic()) + if (OperIsFalseMask()) { - NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); - if (id == NI_Sve_ConvertMaskToVector) + return true; + } + + // Can also be an all zero vector that has been converted to a mask. + if (OperIsConvertVectorToMask()) + { + assert(AsHWIntrinsic()->Op(1)->OperIsTrueMask()); + + GenTree* op2 = AsHWIntrinsic()->Op(2); + if (op2->IsCnsVec() && op2->AsVecCon()->IsZero()) { - GenTree* op1 = AsHWIntrinsic()->Op(1); - assert(op1->OperIsHWIntrinsic()); - id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); + return true; } - return ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64)); } -#endif +#endif // TARGET_ARM64 return false; } diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 019e1115f9bb13..38d8be7fb203cc 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -3959,14 +3959,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* op2 = intrin.op2; GenTree* op3 = intrin.op3; - // Handle op1 - if (op1->IsVectorZero()) - { - // When we are merging with zero, we can specialize - // and avoid instantiating the vector constant. 
- MakeSrcContained(node, op1); - } - // Handle op2 if (op2->OperIsHWIntrinsic() && !op2->IsEmbMaskOp()) { @@ -3982,6 +3974,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // the operation MakeSrcContained(node, op2); op2->MakeEmbMaskOp(); + JITDUMP("Containing op2 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op2); } else { @@ -4002,6 +3996,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { MakeSrcContained(node, op2); op2->MakeEmbMaskOp(); + JITDUMP("Containing convert op2 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op2); } } } @@ -4014,17 +4010,25 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (embOp->Op(2)->IsCnsIntOrI()) { MakeSrcContained(op2, embOp->Op(2)); + JITDUMP("Containing ShiftRight op2 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op2); } } } // Handle op3 - if (op3->IsVectorZero() && op1->IsMaskAllBitsSet()) + if (op3->IsVectorZero() && op1->IsMaskAllBitsSet() && op2->IsEmbMaskOp()) { // When we are merging with zero, we can specialize // and avoid instantiating the vector constant. // Do this only if op1 was AllTrueMask MakeSrcContained(node, op3); + if (op3->OperIsConvertMaskToVector()) + { + MakeSrcContained(node, op3->AsHWIntrinsic()->Op(1)); + } + JITDUMP("Containing all true op3 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op3); } break; diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs new file mode 100644 index 00000000000000..a23bba250d0d2b --- /dev/null +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs @@ -0,0 +1,253 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +// Unit tests for the masks conversion optimization +// Uses vectors as masks and vice versa. 
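+//
+// Each "ARM64-FULL-LINE" comment below is an asmcheck assertion: the test
+// infrastructure matches it against the disassembly generated for the
+// enclosing method, so the expected instruction sequences are encoded
+// directly in these files.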
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Threading;
+using Xunit;
+
+public class ConstantMasks
+{
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static void Consume<T>(T value) { }
+
+    [Fact]
+    public static void TestEntryPoint()
+    {
+        if (Sve.IsSupported)
+        {
+            Vector<int> op1 = Vector.Create<int>(11);
+            Vector<int> op2 = Vector.Create<int>(22);
+            Vector<int> op3 = Vector.Create<int>(33);
+            Vector<long> opl1 = Vector.Create<long>(44);
+            Vector<long> opl2 = Vector.Create<long>(55);
+
+            CndSelectEmbedded(op1, op2, op3);
+            CndSelectEmbeddedFalseMask(op1, op2);
+            CndSelectEmbeddedZero(op1, op2);
+            CndSelectEmbeddedTrueMask(op1, op2);
+            CndSelectEmbeddedAllBits(op1, op2);
+
+            CndSelectOptionalEmbedded(op1, op2, op3);
+            CndSelectOptionalEmbeddedFalseMask(op1, op2);
+            CndSelectOptionalEmbeddedZero(op1, op2);
+            CndSelectOptionalEmbeddedTrueMask(op1, op2);
+            CndSelectOptionalEmbeddedAllBits(op1, op2);
+
+            CndSelectEmbeddedOneOp(op1, op2);
+            CndSelectEmbeddedOneOpFalseMask(op1);
+            CndSelectEmbeddedOneOpZero(op1);
+            CndSelectEmbeddedOneOpTrueMask(op1);
+            CndSelectEmbeddedOneOpAllBits(op1);
+
+            CndSelectEmbeddedReduction(opl1, op2, opl2);
+            CndSelectEmbeddedReductionFalseMask(op1, opl1);
+            CndSelectEmbeddedReductionZero(op1, opl1);
+            CndSelectEmbeddedReductionTrueMask(op1, opl1);
+            CndSelectEmbeddedReductionAllBits(op1, opl1);
+        }
+    }
+
+    // SVE operation (with embedded mask) inside a conditional select
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbedded(Vector<int> mask, Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(mask, Sve.AbsoluteDifference(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedFalseMask(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.AbsoluteDifference(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedZero(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Vector<int>.Zero, Sve.AbsoluteDifference(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedTrueMask(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.AbsoluteDifference(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedAllBits(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: mvni {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.AbsoluteDifference(op1, op2), op1);
+        Consume(result);
+    }
+
+
+    // SVE operation (with optional embedded mask) inside a conditional select
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbedded(Vector<int> mask, Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(mask, Sve.Add(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedFalseMask(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedZero(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Vector<int>.Zero, Sve.Add(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedTrueMask(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.Add(op1, op2), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedAllBits(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.Add(op1, op2), op1);
+        Consume(result);
+    }
+
+
+    // SVE one op operation (with embedded mask) inside a conditional select
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedOneOp(Vector<int> mask, Vector<int> op1) {
+        //ARM64-FULL-LINE: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(mask, Sve.Abs(op1), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedOneOpFalseMask(Vector<int> op1) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Abs(op1), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedOneOpZero(Vector<int> op1) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Vector<int>.Zero, Sve.Abs(op1), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedOneOpTrueMask(Vector<int> op1) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.Abs(op1), op1);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedOneOpAllBits(Vector<int> op1) {
+        //ARM64-FULL-LINE: mvni {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.Abs(op1), op1);
+        Consume(result);
+    }
+
+
+    // SVE reduction operation (with embedded mask) inside a conditional select.
+    // The op and conditional select cannot be combined into one instruction.
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReduction(Vector<long> mask, Vector<int> op1, Vector<long> opf) {
+        //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(mask, Sve.AddAcross(op1), opf);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionFalseMask(Vector<int> op1, Vector<long> opf) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), opf);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionZero(Vector<int> op1, Vector<long> opf) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Vector<long>.Zero, Sve.AddAcross(op1), opf);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionTrueMask(Vector<int> op1, Vector<long> opf) {
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), opf);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionAllBits(Vector<int> op1, Vector<long> opf) {
+        //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Vector<long>.AllBitsSet, Sve.AddAcross(op1), opf);
+        Consume(result);
+    }
+
+}
diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasks.csproj b/src/tests/JIT/opt/MaskConversions/ConstantMasks.csproj
new file mode 100644
index 00000000000000..ed531920304c5e
--- /dev/null
+++ b/src/tests/JIT/opt/MaskConversions/ConstantMasks.csproj
@@ -0,0 +1,19 @@
+
+
+ true
+
+
+ None
+ True
+ $(NoWarn),SYSLIB5003
+
+
+
+ true
+
+
+
+
+
+
diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs
new file mode 100644
index 00000000000000..6af931b072bb39
--- /dev/null
+++ b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs
@@ -0,0 +1,366 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+// Unit tests for the masks conversion optimization
+// Uses vectors as masks and vice versa.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Threading;
+using Xunit;
+
+public class ConstantMasks
+{
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private static void Consume<T>(T value) { }
+
+    [Fact]
+    public static void TestEntryPoint()
+    {
+        if (Sve.IsSupported)
+        {
+            Vector<int> op1 = Vector.Create<int>(11);
+            Vector<int> op2 = Vector.Create<int>(22);
+            Vector<int> op3 = Vector.Create<int>(33);
+            Vector<long> opl1 = Vector.Create<long>(44);
+            Vector<long> opl2 = Vector.Create<long>(55);
+
+            CndSelectEmbeddedF(op1, op2, op3);
+            CndSelectEmbeddedZ(op1, op2, op3);
+            CndSelectEmbeddedFalseMaskF(op1, op2);
+            CndSelectEmbeddedFalseMaskZ(op1, op2);
+            CndSelectEmbeddedZeroF(op1, op2);
+            CndSelectEmbeddedZeroZ(op1, op2);
+            CndSelectEmbeddedTrueMaskF(op1, op2);
+            CndSelectEmbeddedTrueMaskZ(op1, op2);
+            CndSelectEmbeddedAllBitsF(op1, op2);
+            CndSelectEmbeddedAllBitsZ(op1, op2);
+
+            CndSelectOptionalEmbeddedF(op1, op2, op3);
+            CndSelectOptionalEmbeddedZ(op1, op2, op3);
+            CndSelectOptionalEmbeddedFalseMaskF(op1, op2);
+            CndSelectOptionalEmbeddedFalseMaskZ(op1, op2);
+            CndSelectOptionalEmbeddedZeroF(op1, op2);
+            CndSelectOptionalEmbeddedZeroZ(op1, op2);
+            CndSelectOptionalEmbeddedTrueMaskF(op1, op2);
+            CndSelectOptionalEmbeddedTrueMaskZ(op1, op2);
+            CndSelectOptionalEmbeddedAllBitsF(op1, op2);
+            CndSelectOptionalEmbeddedAllBitsZ(op1, op2);
+
+            // CndSelectEmbeddedOneOp(op1, op2);
+            // CndSelectEmbeddedOneOpFalseMask(op1);
+            // CndSelectEmbeddedOneOpZero(op1);
+            // CndSelectEmbeddedOneOpTrueMask(op1);
+            // CndSelectEmbeddedOneOpAllBits(op1);
+
+            CndSelectEmbeddedReductionF(opl1, op2);
+            CndSelectEmbeddedReductionZ(opl1, op2);
+            CndSelectEmbeddedReductionFalseMaskF(op1);
+            CndSelectEmbeddedReductionFalseMaskZ(op1);
+            CndSelectEmbeddedReductionZeroF(op1);
+            CndSelectEmbeddedReductionZeroZ(op1);
+            CndSelectEmbeddedReductionTrueMaskF(op1);
+            CndSelectEmbeddedReductionTrueMaskZ(op1);
+            CndSelectEmbeddedReductionAllBitsF(op1);
+            CndSelectEmbeddedReductionAllBitsZ(op1);
+        }
+    }
+
+    // SVE operation (with embedded mask) inside a conditional select
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedF(Vector<int> mask, Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(mask, Sve.AbsoluteDifference(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedZ(Vector<int> mask, Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(mask, Sve.AbsoluteDifference(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedFalseMaskF(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.AbsoluteDifference(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedFalseMaskZ(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.AbsoluteDifference(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedZeroF(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.Zero, Sve.AbsoluteDifference(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedZeroZ(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.Zero, Sve.AbsoluteDifference(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedTrueMaskF(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.AbsoluteDifference(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedTrueMaskZ(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movprfx {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.AbsoluteDifference(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedAllBitsF(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: mvni {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: movprfx {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.AbsoluteDifference(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedAllBitsZ(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: mvni {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.AbsoluteDifference(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    // SVE one op operation (with embedded mask) inside a conditional select
+
+///......
+
+    // SVE operation (with optional embedded mask) inside a conditional select
+
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedF(Vector<int> mask, Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(mask, Sve.Add(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedZ(Vector<int> mask, Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(mask, Sve.Add(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedFalseMaskF(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedFalseMaskZ(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedZeroF(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.Zero, Sve.Add(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedZeroZ(Vector<int> op1, Vector<int> op2) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0
+        //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.Zero, Sve.Add(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedTrueMaskF(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.Add(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedTrueMaskZ(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        Vector<int> result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.Add(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedAllBitsF(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.Add(op1, op2), Sve.CreateFalseMaskInt32());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectOptionalEmbeddedAllBitsZ(Vector<int> op1, Vector<int> op2) {
+        //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s
+        var result = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.Add(op1, op2), Vector<int>.Zero);
+        Consume(result);
+    }
+
+    // SVE reduction operation (with embedded mask) inside a conditional select.
+    // The op and conditional select cannot be combined into one instruction.
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionF(Vector<long> mask, Vector<int> op1) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(mask, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionZ(Vector<long> mask, Vector<int> op1) {
+        //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(mask, Sve.AddAcross(op1), Vector<long>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionFalseMaskF(Vector<int> op1) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), Sve.CreateFalseMaskInt64());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionFalseMaskZ(Vector<int> op1) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), Vector<long>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionZeroF(Vector<int> op1) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Vector<long>.Zero, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionZeroZ(Vector<int> op1) {
+        //ARMSVE-TODO: This could be optimised to remove both instructions #114433
+        //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Vector<long>.Zero, Sve.AddAcross(op1), Vector<long>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionTrueMaskF(Vector<int> op1) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt64(), Sve.AddAcross(op1), Sve.CreateFalseMaskInt64());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionTrueMaskZ(Vector<int> op1) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt64(), Sve.AddAcross(op1), Vector<long>.Zero);
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionAllBitsF(Vector<int> op1) {
+        //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b
+        //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Vector<long>.AllBitsSet, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64());
+        Consume(result);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static void CndSelectEmbeddedReductionAllBitsZ(Vector<int> op1) {
+        //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0
+        //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s
+        //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0
+        //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d
+        Vector<long> result = Sve.ConditionalSelect(Vector<long>.AllBitsSet, Sve.AddAcross(op1), Vector<long>.Zero);
+        Consume(result);
+    }
+}
diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.csproj b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.csproj
new file mode 100644
index 00000000000000..ed531920304c5e
--- /dev/null
+++ 
b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.csproj @@ -0,0 +1,19 @@ + + + + true + + + None + True + $(NoWarn),SYSLIB5003 + + + + true + + + + + + From fefb33c9f2fbbe390ef443f8538a00119d929eae Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 19 May 2025 14:10:26 +0100 Subject: [PATCH 02/62] Remove all jit changes --- src/coreclr/jit/gentree.cpp | 35 +-------- src/coreclr/jit/gentree.h | 129 ++++++++----------------------- src/coreclr/jit/lowerarmarch.cpp | 22 +++--- 3 files changed, 43 insertions(+), 143 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 04bad868bb76ce..6a062b02f2b12d 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20437,8 +20437,7 @@ bool GenTree::isContainableHWIntrinsic() const } } #elif defined(TARGET_ARM64) - return (AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect || IsVectorAllBitsSet() || - IsMaskAllBitsSet() || IsVectorZero() || IsMaskZero()); + return (AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect); #else return false; #endif // TARGET_XARCH @@ -28132,38 +28131,6 @@ bool GenTree::OperIsVectorConditionalSelect() const return false; } -//------------------------------------------------------------------------ -// OperIsFalseMask: Is this a vector CreateFalseMask hwintrinsic -// -// Return Value: -// true if the node is a vector CreateFalseMask hwintrinsic -// otherwise; false -// -bool GenTree::OperIsFalseMask() const -{ - if (OperIsHWIntrinsic()) - { - return AsHWIntrinsic()->OperIsFalseMask(); - } - return false; -} - -//------------------------------------------------------------------------ -// OperIsTrueMask: Is this a vector CreateTrueMask hwintrinsic -// -// Return Value: -// true if the node is a vector CreateTrueMask hwintrinsic -// otherwise; false -// -bool GenTree::OperIsTrueMask() const -{ - if (OperIsHWIntrinsic()) - { - return AsHWIntrinsic()->OperIsTrueMask(); - } - return false; -} - //------------------------------------------------------------------------ // OperIsMemoryLoad: Does this HWI node have memory load semantics? // diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index cc7c97f4cd0c64..f3fb94b09429e8 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1688,8 +1688,6 @@ struct GenTree bool OperIsConvertMaskToVector() const; bool OperIsConvertVectorToMask() const; bool OperIsVectorConditionalSelect() const; - bool OperIsFalseMask() const; - bool OperIsTrueMask() const; // This is here for cleaner GT_LONG #ifdefs. 
static bool OperIsLong(genTreeOps gtOper) @@ -6479,37 +6477,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic #endif } - bool OperIsFalseMask() const - { -#if defined(TARGET_ARM64) - static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble, - NI_Sve_CreateFalseMaskInt16, NI_Sve_CreateFalseMaskInt32, - NI_Sve_CreateFalseMaskInt64, NI_Sve_CreateFalseMaskSByte, - NI_Sve_CreateFalseMaskSingle, NI_Sve_CreateFalseMaskUInt16, - NI_Sve_CreateFalseMaskUInt32, NI_Sve_CreateFalseMaskUInt64)); - - NamedIntrinsic id = GetHWIntrinsicId(); - return ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64)); -#endif - return false; - } - - bool OperIsTrueMask() const - { -#if defined(TARGET_ARM64) - static_assert_no_msg(AreContiguous(NI_Sve_CreateTrueMaskByte, NI_Sve_CreateTrueMaskDouble, - NI_Sve_CreateTrueMaskInt16, NI_Sve_CreateTrueMaskInt32, - NI_Sve_CreateTrueMaskInt64, NI_Sve_CreateTrueMaskSByte, - NI_Sve_CreateTrueMaskSingle, NI_Sve_CreateTrueMaskUInt16, - NI_Sve_CreateTrueMaskUInt32, NI_Sve_CreateTrueMaskUInt64)); - - NamedIntrinsic id = GetHWIntrinsicId(); - return ((id == NI_Sve_CreateTrueMaskAll) || - ((id >= NI_Sve_CreateTrueMaskByte) && (id <= NI_Sve_CreateTrueMaskUInt64))); -#endif - return false; - } - bool OperRequiresAsgFlag() const; bool OperRequiresCallFlag() const; bool OperRequiresGlobRefFlag() const; @@ -9625,21 +9592,10 @@ inline bool GenTree::IsFloatPositiveZero() const inline bool GenTree::IsVectorZero() const { #if defined(FEATURE_SIMD) - if (IsCnsVec() && AsVecCon()->IsZero()) - { - return true; - } - -#if defined(TARGET_ARM64) - // Can also be an all false mask that has been converted to a vector. - if (OperIsConvertMaskToVector() && AsHWIntrinsic()->Op(1)->OperIsFalseMask()) - { - return true; - } -#endif // TARGET_ARM64 -#endif // FEATURE_SIMD - + return IsCnsVec() && AsVecCon()->IsZero(); +#else return false; +#endif // FEATURE_SIMD } //------------------------------------------------------------------- @@ -9662,7 +9618,7 @@ inline bool GenTree::IsVectorNegativeZero(var_types simdBaseType) const } //------------------------------------------------------------------- -// IsVectorNaN: returns true if this node is a vector constant with all bits zero. +// IsVectorZero: returns true if this node is a vector constant with all bits zero. // // Arguments: // simdBaseType - the base type of the constant being checked @@ -9725,14 +9681,6 @@ inline bool GenTree::IsVectorAllBitsSet() const { return AsVecCon()->IsAllBitsSet(); } - -#if defined(TARGET_ARM64) - // Can also be an all true mask that has been converted to a vector. - if (OperIsConvertMaskToVector() && AsHWIntrinsic()->Op(1)->OperIsTrueMask()) - { - return true; - } -#endif // TARGET_ARM64 #endif // FEATURE_SIMD return false; @@ -9756,65 +9704,54 @@ inline bool GenTree::IsVectorBroadcast(var_types simdBaseType) const return false; } -//------------------------------------------------------------------- -// IsMaskAllBitsSet: returns true if this node is a mask constant with all bits set. 
-// -// Returns: -// True if this node is a mask constant with all bits set -// inline bool GenTree::IsMaskAllBitsSet() const { -#if defined(TARGET_ARM64) - - if (OperIsTrueMask()) - { - return true; - } +#ifdef TARGET_ARM64 + static_assert_no_msg(AreContiguous(NI_Sve_CreateTrueMaskByte, NI_Sve_CreateTrueMaskDouble, + NI_Sve_CreateTrueMaskInt16, NI_Sve_CreateTrueMaskInt32, + NI_Sve_CreateTrueMaskInt64, NI_Sve_CreateTrueMaskSByte, + NI_Sve_CreateTrueMaskSingle, NI_Sve_CreateTrueMaskUInt16, + NI_Sve_CreateTrueMaskUInt32, NI_Sve_CreateTrueMaskUInt64)); - // Can also be an all bits set vector that has been converted to a mask. - if (OperIsConvertVectorToMask()) + if (OperIsHWIntrinsic()) { - assert(AsHWIntrinsic()->Op(1)->OperIsTrueMask()); - - GenTree* op2 = AsHWIntrinsic()->Op(2); - if (op2->IsCnsVec() && op2->AsVecCon()->IsAllBitsSet()) + NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); + if (id == NI_Sve_ConvertMaskToVector) { - return true; + GenTree* op1 = AsHWIntrinsic()->Op(1); + assert(op1->OperIsHWIntrinsic()); + id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); } + return ((id == NI_Sve_CreateTrueMaskAll) || + ((id >= NI_Sve_CreateTrueMaskByte) && (id <= NI_Sve_CreateTrueMaskUInt64))); } -#endif // TARGET_ARM64 +#endif return false; } -//------------------------------------------------------------------- -// IsMaskZero: returns true if this node is a mask constant with all bits zero. -// -// Returns: -// True if this node is a mask constant with all bits zero -// inline bool GenTree::IsMaskZero() const { -#if defined(TARGET_ARM64) - - if (OperIsFalseMask()) - { - return true; - } +#ifdef TARGET_ARM64 + static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble, + NI_Sve_CreateFalseMaskInt16, NI_Sve_CreateFalseMaskInt32, + NI_Sve_CreateFalseMaskInt64, NI_Sve_CreateFalseMaskSByte, + NI_Sve_CreateFalseMaskSingle, NI_Sve_CreateFalseMaskUInt16, + NI_Sve_CreateFalseMaskUInt32, NI_Sve_CreateFalseMaskUInt64)); - // Can also be an all zero vector that has been converted to a mask. - if (OperIsConvertVectorToMask()) + if (OperIsHWIntrinsic()) { - assert(AsHWIntrinsic()->Op(1)->OperIsTrueMask()); - - GenTree* op2 = AsHWIntrinsic()->Op(2); - if (op2->IsCnsVec() && op2->AsVecCon()->IsZero()) + NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); + if (id == NI_Sve_ConvertMaskToVector) { - return true; + GenTree* op1 = AsHWIntrinsic()->Op(1); + assert(op1->OperIsHWIntrinsic()); + id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); } + return ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64)); } -#endif // TARGET_ARM64 +#endif return false; } diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 38d8be7fb203cc..019e1115f9bb13 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -3959,6 +3959,14 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) GenTree* op2 = intrin.op2; GenTree* op3 = intrin.op3; + // Handle op1 + if (op1->IsVectorZero()) + { + // When we are merging with zero, we can specialize + // and avoid instantiating the vector constant. 
+ MakeSrcContained(node, op1); + } + // Handle op2 if (op2->OperIsHWIntrinsic() && !op2->IsEmbMaskOp()) { @@ -3974,8 +3982,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // the operation MakeSrcContained(node, op2); op2->MakeEmbMaskOp(); - JITDUMP("Containing op2 inside ConditionalSelect\n"); - DISPTREERANGE(BlockRange(), op2); } else { @@ -3996,8 +4002,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { MakeSrcContained(node, op2); op2->MakeEmbMaskOp(); - JITDUMP("Containing convert op2 inside ConditionalSelect\n"); - DISPTREERANGE(BlockRange(), op2); } } } @@ -4010,25 +4014,17 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (embOp->Op(2)->IsCnsIntOrI()) { MakeSrcContained(op2, embOp->Op(2)); - JITDUMP("Containing ShiftRight op2 inside ConditionalSelect\n"); - DISPTREERANGE(BlockRange(), op2); } } } // Handle op3 - if (op3->IsVectorZero() && op1->IsMaskAllBitsSet() && op2->IsEmbMaskOp()) + if (op3->IsVectorZero() && op1->IsMaskAllBitsSet()) { // When we are merging with zero, we can specialize // and avoid instantiating the vector constant. // Do this only if op1 was AllTrueMask MakeSrcContained(node, op3); - if (op3->OperIsConvertMaskToVector()) - { - MakeSrcContained(node, op3->AsHWIntrinsic()->Op(1)); - } - JITDUMP("Containing all true op3 inside ConditionalSelect\n"); - DISPTREERANGE(BlockRange(), op3); } break; From 65d987e9db6bcc7299176ec2052d37f20d014009 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 19 May 2025 16:34:31 +0100 Subject: [PATCH 03/62] Import constant vector 0 for createfalsemask --- src/coreclr/jit/gentree.h | 3 +- src/coreclr/jit/hwintrinsicarm64.cpp | 15 +++++++++ src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 4 +++ src/coreclr/jit/hwintrinsiclistarm64sve.h | 21 +++++++------ src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerarmarch.cpp | 31 +++++++++++++++++++ .../JIT/opt/MaskConversions/ConstantMasks.cs | 3 +- 7 files changed, 65 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index f3fb94b09429e8..270b95e49c9b98 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -9748,7 +9748,8 @@ inline bool GenTree::IsMaskZero() const assert(op1->OperIsHWIntrinsic()); id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); } - return ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64)); + return ((id == NI_Sve_CreateFalseMaskAll) || + ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64))); } #endif diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index a9c50c029cc22f..4e55338d0843d9 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2757,6 +2757,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Sve_CreateFalseMaskByte: + case NI_Sve_CreateFalseMaskDouble: + case NI_Sve_CreateFalseMaskInt16: + case NI_Sve_CreateFalseMaskInt32: + case NI_Sve_CreateFalseMaskInt64: + case NI_Sve_CreateFalseMaskSByte: + case NI_Sve_CreateFalseMaskSingle: + case NI_Sve_CreateFalseMaskUInt16: + case NI_Sve_CreateFalseMaskUInt32: + case NI_Sve_CreateFalseMaskUInt64: + { + retNode = gtNewZeroConNode(TYP_SIMD16); + break; + } + case NI_Sve_Load2xVectorAndUnzip: case NI_Sve_Load3xVectorAndUnzip: case NI_Sve_Load4xVectorAndUnzip: diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 9bb7a1e4f39356..4b1020062ceb45 100644 --- 
a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2025,6 +2025,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Sve_CreateFalseMaskAll: + GetEmitter()->emitInsSve_R(ins, emitSize, targetReg, opt); + break; + case NI_Sve_CreateTrueMaskAll: // Must use the pattern variant, as the non-pattern varient is SVE2.1. GetEmitter()->emitIns_R_PATTERN(ins, emitSize, targetReg, opt, SVE_PATTERN_ALL); diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index 9bb76b0ad038a5..ef75f387cbbd13 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -63,16 +63,16 @@ HARDWARE_INTRINSIC(Sve, CreateBreakAfterPropagateMask, HARDWARE_INTRINSIC(Sve, CreateBreakBeforeMask, -1, 2, {INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_sve_brkb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, CreateBreakBeforePropagateMask, -1, 3, {INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_sve_brkpb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialCodeGen|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, CreateBreakPropagateMask, -1, -1, {INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_sve_brkn, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_HasRMWSemantics|HW_Flag_ZeroingMaskedOperation) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskByte, -1, 0, {INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskDouble, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt16, -1, 0, {INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskSByte, -1, 0, {INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskSingle, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid}, HW_Category_SIMD, 
HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt16, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskByte, -1, 0, {INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskDouble, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt16, -1, 0, {INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskSByte, -1, 0, {INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskSingle, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt16, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Sve, CreateMaskForFirstActiveElement, -1, 2, {INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_invalid, INS_invalid}, HW_Category_SIMD, 
HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, CreateMaskForNextActiveElement, -1, 2, {INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, CreateTrueMaskByte, -1, 1, {INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) @@ -314,6 +314,7 @@ HARDWARE_INTRINSIC(Sve, ConditionalExtractAfterLastActiveElementScalar HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElementScalar, 0, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_HasRMWSemantics|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertMaskToVector, -1, 1, {INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov}, HW_Category_Helper, HW_Flag_Scalable) HARDWARE_INTRINSIC(Sve, ConvertVectorToMask, -1, 2, {INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, CreateFalseMaskAll, -1, 0, {INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, CreateTrueMaskAll, -1, 0, {INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) // Scalar variants of Saturating*By*BitElementCount. There is 8bit versions as the generic version is scalar only. 
HARDWARE_INTRINSIC(Sve, SaturatingDecrementBy16BitElementCountScalar, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sqdech, INS_sve_uqdech, INS_sve_sqdech, INS_sve_uqdech, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_HasRMWSemantics) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 388e136f3f7812..a93e4b1b97fb1c 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -443,6 +443,7 @@ class Lowering final : public Phase GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node); + GenTree* LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* node); #if defined(TARGET_XARCH) void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 019e1115f9bb13..57ee06a596adbc 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1825,13 +1825,19 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AdvSimd_FusedMultiplyAddScalar: LowerHWIntrinsicFusedMultiplyAddScalar(node); break; + case NI_Sve_ConditionalSelect: return LowerHWIntrinsicCndSel(node); + + case NI_Sve_ConvertVectorToMask: + return LowerHWIntrinsicConvertVectorToMask(node); + case NI_Sve_SetFfr: { StoreFFRValue(node); break; } + case NI_Sve_GetFfrByte: case NI_Sve_GetFfrInt16: case NI_Sve_GetFfrInt32: @@ -4227,6 +4233,31 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) return cndSelNode->gtNext; } + +GenTree* Lowering::LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* mask) +{ + assert(mask->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); + + GenTree* op1 = mask->Op(1); + GenTree* op2 = mask->Op(2); + + if(op2->IsVectorZero()) + { + // Transform ConvertVectorToMask(..., ConstVec(0)) to FalseMask + + op1->SetUnusedValue(); + op2->SetUnusedValue(); + mask->ResetHWIntrinsicId(NI_Sve_CreateFalseMaskAll, comp); + + JITDUMP("lowering ConvertVectorToMask(ZeroVector) to FalseMask:\n"); + DISPTREERANGE(BlockRange(), mask); + JITDUMP("\n"); + } + + return mask->gtNext; +} + + #if defined(TARGET_ARM64) //---------------------------------------------------------------------------------------------- // StoreFFRValue: For hwintrinsic that produce a first faulting register (FFR) value, create diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs index a23bba250d0d2b..96ef8c631e982c 100644 --- a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs @@ -76,8 +76,7 @@ static void CndSelectEmbeddedFalseMask(Vector op1, Vector op2) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedZero(Vector op1, Vector op2) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AbsoluteDifference(op1, op2), op1); Consume(result); From c93db61c70e448cbb5114aad10664a841c389ffc Mon Sep 17 00:00:00 2001 From: Alan Hayward 
Date: Mon, 19 May 2025 17:04:23 +0100 Subject: [PATCH 04/62] fix up tests --- .../JIT/opt/MaskConversions/ConstantMasks.cs | 21 ++++++------------- .../MaskConversions/ConstantMasksOp2Fixed.cs | 4 ++-- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs index 96ef8c631e982c..d3e15f0f5bb5f4 100644 --- a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs @@ -121,8 +121,7 @@ static void CndSelectOptionalEmbeddedFalseMask(Vector op1, Vector op2) [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedZero(Vector op1, Vector op2) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.Add(op1, op2), op1); Consume(result); @@ -137,7 +136,7 @@ static void CndSelectOptionalEmbeddedTrueMask(Vector op1, Vector op2) [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedAllBits(Vector op1, Vector op2) { - //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.Add(op1, op2), op1); Consume(result); } @@ -164,8 +163,7 @@ static void CndSelectEmbeddedOneOpFalseMask(Vector op1) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedOneOpZero(Vector op1) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.Abs(op1), op1); Consume(result); @@ -196,7 +194,6 @@ static void CndSelectEmbeddedOneOpAllBits(Vector op1) { static void CndSelectEmbeddedReduction(Vector mask, Vector op1, Vector opf) { //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(mask, Sve.AddAcross(op1), opf); @@ -208,7 +205,6 @@ static void CndSelectEmbeddedReductionFalseMask(Vector op1, Vector op //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), opf); @@ -218,9 +214,7 @@ static void CndSelectEmbeddedReductionFalseMask(Vector op1, Vector op [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionZero(Vector op1, Vector opf) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 
- //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 - //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AddAcross(op1), opf); @@ -229,12 +223,10 @@ static void CndSelectEmbeddedReductionZero(Vector op1, Vector opf) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionTrueMask(Vector op1, Vector opf) { - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d - Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), opf); + Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt64(), Sve.AddAcross(op1), opf); Consume(result); } @@ -242,7 +234,6 @@ static void CndSelectEmbeddedReductionTrueMask(Vector op1, Vector opf static void CndSelectEmbeddedReductionAllBits(Vector op1, Vector opf) { //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AddAcross(op1), opf); diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs index 6af931b072bb39..f03ddd2b1a9037 100644 --- a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs @@ -253,10 +253,10 @@ static void CndSelectOptionalEmbeddedAllBitsZ(Vector op1, Vector op2) [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionF(Vector mask, Vector op1) { + //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(mask, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); From ae4847b68a5665a4d5d01cdd13c6c487c4cd3e5c Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 20 May 2025 14:47:48 +0100 Subject: [PATCH 05/62] Only allow zero op3 contains for embedded ops --- src/coreclr/jit/lowerarmarch.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 57ee06a596adbc..e0a968a4c67504 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -3971,6 +3971,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // When we are merging with zero, we can specialize // and avoid instantiating the vector constant. 
MakeSrcContained(node, op1); + JITDUMP("Containing vector zero op1 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op1); } // Handle op2 @@ -3988,6 +3990,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // the operation MakeSrcContained(node, op2); op2->MakeEmbMaskOp(); + JITDUMP("Containing op2 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op2); } else { @@ -4008,6 +4012,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { MakeSrcContained(node, op2); op2->MakeEmbMaskOp(); + JITDUMP("Containing convert op2 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op2); } } } @@ -4020,17 +4026,21 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (embOp->Op(2)->IsCnsIntOrI()) { MakeSrcContained(op2, embOp->Op(2)); + JITDUMP("Containing ShiftRight op2 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op2); } } } // Handle op3 - if (op3->IsVectorZero() && op1->IsMaskAllBitsSet()) + if (op3->IsVectorZero() && op1->IsMaskAllBitsSet() && op2->IsEmbMaskOp()) { // When we are merging with zero, we can specialize // and avoid instantiating the vector constant. // Do this only if op1 was AllTrueMask MakeSrcContained(node, op3); + JITDUMP("Containing vector zero op3 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), op3); } break; From 0c2316d279b37a34330b42f3b22bb46a37d15d65 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 20 May 2025 15:05:22 +0100 Subject: [PATCH 06/62] fix up tests --- .../MaskConversions/ConstantMasksOp2Fixed.cs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs index f03ddd2b1a9037..75ef53a74e0220 100644 --- a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs @@ -278,8 +278,7 @@ static void CndSelectEmbeddedReductionFalseMaskF(Vector op1) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); @@ -302,8 +301,7 @@ static void CndSelectEmbeddedReductionZeroF(Vector op1) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); @@ -312,8 +310,7 @@ static void CndSelectEmbeddedReductionZeroF(Vector op1) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionZeroZ(Vector op1) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 - 
//ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d @@ -325,8 +322,7 @@ static void CndSelectEmbeddedReductionZeroZ(Vector op1) { static void CndSelectEmbeddedReductionTrueMaskF(Vector op1) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt64(), Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); @@ -344,10 +340,10 @@ static void CndSelectEmbeddedReductionTrueMaskZ(Vector op1) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionAllBitsF(Vector op1) { - //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 + //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: mov {{z[0-9]+}}.d, {{p[0-9]+}}/z, #1 + //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); From 63af10016e16d319353a501b5bfb20511d68e695 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 20 May 2025 16:38:48 +0100 Subject: [PATCH 07/62] fix formatting --- src/coreclr/jit/lowerarmarch.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index e0a968a4c67504..92d7d715ff5535 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -4243,7 +4243,6 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) return cndSelNode->gtNext; } - GenTree* Lowering::LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* mask) { assert(mask->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); @@ -4251,7 +4250,7 @@ GenTree* Lowering::LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* mask) GenTree* op1 = mask->Op(1); GenTree* op2 = mask->Op(2); - if(op2->IsVectorZero()) + if (op2->IsVectorZero()) { // Transform ConvertVectorToMask(..., ConstVec(0)) to FalseMask @@ -4267,7 +4266,6 @@ GenTree* Lowering::LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* mask) return mask->gtNext; } - #if defined(TARGET_ARM64) //---------------------------------------------------------------------------------------------- // StoreFFRValue: For hwintrinsic that produce a first faulting register (FFR) value, create From cac18d070018ec05dcf18d8107440d0fa88d325a Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 21 May 2025 12:11:02 +0100 Subject: [PATCH 08/62] Import constant vector all bits set for createtruemask --- src/coreclr/jit/hwintrinsicarm64.cpp | 32 +++++++++++++++++++++++ src/coreclr/jit/hwintrinsiclistarm64sve.h | 20 +++++++------- src/coreclr/jit/lowerarmarch.cpp | 18 +++++++++++-- 3 files changed, 58 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp 
b/src/coreclr/jit/hwintrinsicarm64.cpp index 4e55338d0843d9..d330e6b27c3f00 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2768,10 +2768,42 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Sve_CreateFalseMaskUInt32: case NI_Sve_CreateFalseMaskUInt64: { + // Import as a constant vector 0 retNode = gtNewZeroConNode(TYP_SIMD16); break; } + case NI_Sve_CreateTrueMaskByte: + case NI_Sve_CreateTrueMaskDouble: + case NI_Sve_CreateTrueMaskInt16: + case NI_Sve_CreateTrueMaskInt32: + case NI_Sve_CreateTrueMaskInt64: + case NI_Sve_CreateTrueMaskSByte: + case NI_Sve_CreateTrueMaskSingle: + case NI_Sve_CreateTrueMaskUInt16: + case NI_Sve_CreateTrueMaskUInt32: + case NI_Sve_CreateTrueMaskUInt64: + { + assert(sig->numArgs == 1); + op1 = impPopStack().val; + + if (op1->IsIntegralConst(31)) + { + // This is considered to be an all true mask. Import as a constant vector all bits set. + // TODO: Depending on the vector length, we may be able to consider other patterns + // as all true mask. + retNode = gtNewAllBitsSetConNode(TYP_SIMD16); + } + else + { + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + + // Do the convert to vector here (as the instrinsic is not marked with HW_Flag_ReturnsPerElementMask) + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode->AsHWIntrinsic(), simdBaseJitType, simdSize); + } + break; + } + case NI_Sve_Load2xVectorAndUnzip: case NI_Sve_Load3xVectorAndUnzip: case NI_Sve_Load4xVectorAndUnzip: diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index ef75f387cbbd13..46af5200c7305f 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -75,16 +75,16 @@ HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt32, HARDWARE_INTRINSIC(Sve, CreateFalseMaskUInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_pfalse, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Sve, CreateMaskForFirstActiveElement, -1, 2, {INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_sve_pfirst, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, CreateMaskForNextActiveElement, -1, 2, {INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_sve_pnext, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_HasRMWSemantics) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskByte, -1, 1, {INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskDouble, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt16, -1, 1, {INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskSByte, -1, 1, {INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskSingle, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskByte, -1, 1, {INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskDouble, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt16, -1, 1, {INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, CreateTrueMaskSByte, -1, 1, {INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(Sve, CreateTrueMaskSingle, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport)
+HARDWARE_INTRINSIC(Sve, CreateTrueMaskUInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ptrue, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Sve, CreateWhileLessThanMask16Bit, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_sve_whilelt, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Sve, CreateWhileLessThanMask32Bit, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_whilelt, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Sve, CreateWhileLessThanMask64Bit, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_whilelt, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask)
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 92d7d715ff5535..c1417e3bb8c2c0 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -4254,14 +4254,28 @@ GenTree* Lowering::LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* mask)
     {
         // Transform ConvertVectorToMask(..., ConstVec(0)) to FalseMask
 
-        op1->SetUnusedValue();
-        op2->SetUnusedValue();
+        assert(op1->OperIsHWIntrinsic(NI_Sve_CreateTrueMaskAll));
+        BlockRange().Remove(op1);
+        BlockRange().Remove(op2);
         mask->ResetHWIntrinsicId(NI_Sve_CreateFalseMaskAll, comp);
 
         JITDUMP("lowering ConvertVectorToMask(ZeroVector) to FalseMask:\n");
         DISPTREERANGE(BlockRange(), mask);
         JITDUMP("\n");
     }
+
+    if (op2->IsVectorAllBitsSet())
+    {
+        // Transform ConvertVectorToMask(..., ConstVec(11111...)) to TrueMask
+
+        assert(op1->OperIsHWIntrinsic(NI_Sve_CreateTrueMaskAll));
+        BlockRange().Remove(op1);
+        BlockRange().Remove(op2);
+        mask->ResetHWIntrinsicId(NI_Sve_CreateTrueMaskAll, comp);
+
+        JITDUMP("lowering ConvertVectorToMask(AllBitsSetVector) to TrueMask:\n");
+        DISPTREERANGE(BlockRange(), mask);
+        JITDUMP("\n");
+    }
 
     return mask->gtNext;
 }
From c97d8a341c840850d08f1b94f69a0ba2770addbd Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Wed, 21 May 2025 12:18:12 +0100
Subject: [PATCH 09/62] Fix up tests
---
 src/tests/JIT/opt/MaskConversions/ConstantMasks.cs | 13 +++++--------
 .../opt/MaskConversions/ConstantMasksOp2Fixed.cs |
6 ++---- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs index d3e15f0f5bb5f4..e1d78e2a250930 100644 --- a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs @@ -92,8 +92,7 @@ static void CndSelectEmbeddedTrueMask(Vector op1, Vector op2) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedAllBits(Vector op1, Vector op2) { - //ARM64-FULL-LINE: mvni {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AbsoluteDifference(op1, op2), op1); Consume(result); @@ -136,7 +135,7 @@ static void CndSelectOptionalEmbeddedTrueMask(Vector op1, Vector op2) [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedAllBits(Vector op1, Vector op2) { - //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.Add(op1, op2), op1); Consume(result); } @@ -179,9 +178,8 @@ static void CndSelectEmbeddedOneOpTrueMask(Vector op1) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedOneOpAllBits(Vector op1) { - //ARM64-FULL-LINE: mvni {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 - //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.Abs(op1), op1); Consume(result); } @@ -232,8 +230,7 @@ static void CndSelectEmbeddedReductionTrueMask(Vector op1, Vector opf [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionAllBits(Vector op1, Vector opf) { - //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 - //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AddAcross(op1), opf); diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs index 75ef53a74e0220..47e63da38dc5e9 100644 --- a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs @@ -340,8 +340,7 @@ static void CndSelectEmbeddedReductionTrueMaskZ(Vector op1) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionAllBitsF(Vector op1) { - //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 - //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d @@ -351,8 +350,7 @@ static void CndSelectEmbeddedReductionAllBitsF(Vector op1) { [MethodImpl(MethodImplOptions.NoInlining)] static void 
CndSelectEmbeddedReductionAllBitsZ(Vector op1) { - //ARM64-FULL-LINE: cmpne {{p[0-9]+}}.d, {{p[0-9]+}}/z, {{z[0-9]+}}.d, #0 - //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d From 32ac0d33f64b4168b47e5da571f32d2d2da4a535 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 21 May 2025 12:36:06 +0100 Subject: [PATCH 10/62] fix type of true mask variants --- src/coreclr/jit/hwintrinsicarm64.cpp | 2 +- src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index d330e6b27c3f00..0fc5682c959933 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2796,7 +2796,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } else { - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); // Do the convert to vector here (as the instrinsic is not marked with HW_Flag_ReturnsPerElementMask) retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode->AsHWIntrinsic(), simdBaseJitType, simdSize); diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs index 47e63da38dc5e9..c2b16ac691930e 100644 --- a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs +++ b/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs @@ -51,12 +51,6 @@ public static void TestEntryPoint() CndSelectOptionalEmbeddedAllBitsF(op1, op2); CndSelectOptionalEmbeddedAllBitsZ(op1, op2); - // CndSelectEmbeddedOneOp(op1, op2); - // CndSelectEmbeddedOneOpFalseMask(op1); - // CndSelectEmbeddedOneOpZero(op1); - // CndSelectEmbeddedOneOpTrueMask(op1); - // CndSelectEmbeddedOneOpAllBits(op1); - CndSelectEmbeddedReductionF(opl1, op2); CndSelectEmbeddedReductionZ(opl1, op2); CndSelectEmbeddedReductionFalseMaskF(op1); From 7204965330133cafe0942b405820f58f37da1348 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 21 May 2025 17:05:19 +0100 Subject: [PATCH 11/62] Allow common code to create the convert for CreateTrueMask* --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/hwintrinsic.cpp | 3 +-- src/coreclr/jit/hwintrinsicarm64.cpp | 19 ++++++++++--------- src/coreclr/jit/hwintrinsicxarch.cpp | 15 ++++++++------- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 891b33a169e46d..edccd2d874f49a 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4702,7 +4702,7 @@ class Compiler CORINFO_SIG_INFO* sig R2RARG(CORINFO_CONST_LOOKUP* entryPoint), CorInfoType simdBaseJitType, - var_types retType, + var_types* retType, unsigned simdSize, bool mustExpand); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index a00d57962d757b..ca6c77e7876994 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -2306,7 +2306,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } else { - retNode = impSpecialIntrinsic(intrinsic, clsHnd, method, sig R2RARG(entryPoint), simdBaseJitType, nodeRetType, + retNode = impSpecialIntrinsic(intrinsic, clsHnd, method, 
sig R2RARG(entryPoint), simdBaseJitType, &nodeRetType, simdSize, mustExpand); } @@ -2383,7 +2383,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, if (retType != nodeRetType) { // HWInstrinsic returns a mask, but all returns must be vectors, so convert mask to vector. - assert(HWIntrinsicInfo::ReturnsPerElementMask(intrinsic)); assert(nodeRetType == TYP_MASK); GenTreeHWIntrinsic* op = retNode->AsHWIntrinsic(); diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 0fc5682c959933..e55efc85997518 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -639,7 +639,7 @@ GenTree* Compiler::impNonConstFallback(NamedIntrinsic intrinsic, var_types simdT // sig -- signature of the intrinsic call. // entryPoint -- The entry point information required for R2R scenarios // simdBaseJitType -- generic argument of the intrinsic. -// retType -- return type of the intrinsic. +// pRetType -- return type of the intrinsic. May be updated. // mustExpand -- true if the intrinsic must return a GenTree*; otherwise, false // // Return Value: @@ -650,12 +650,13 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig R2RARG(CORINFO_CONST_LOOKUP* entryPoint), CorInfoType simdBaseJitType, - var_types retType, + var_types* pRetType, unsigned simdSize, bool mustExpand) { const HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic); const int numArgs = sig->numArgs; + var_types retType = *pRetType; // The vast majority of "special" intrinsics are Vector64/Vector128 methods. // The only exception is ArmBase.Yield which should be treated differently. @@ -2769,7 +2770,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Sve_CreateFalseMaskUInt64: { // Import as a constant vector 0 - retNode = gtNewZeroConNode(TYP_SIMD16); + retNode = gtNewZeroConNode(retType); break; } @@ -2785,21 +2786,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Sve_CreateTrueMaskUInt64: { assert(sig->numArgs == 1); + op1 = impPopStack().val; if (op1->IsIntegralConst(31)) { // This is considered to be an all true mask. Import as a constant vector all bits set. // TODO: Depending on the vector length, we may be able to consider other patterns - // as all true mask. - retNode = gtNewAllBitsSetConNode(TYP_SIMD16); + // as all true mask, however these will not be commonly used. + retNode = gtNewAllBitsSetConNode(retType); } else { - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); - - // Do the convert to vector here (as the instrinsic is not marked with HW_Flag_ReturnsPerElementMask) - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode->AsHWIntrinsic(), simdBaseJitType, simdSize); + // Create a node of TYP_MASK, making sure to update pRetType + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); + *pRetType = TYP_MASK; } break; } diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 6098fed4b4485c..590dd07190edb8 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1377,7 +1377,7 @@ GenTree* Compiler::impNonConstFallback(NamedIntrinsic intrinsic, var_types simdT // sig -- signature of the intrinsic call. // entryPoint -- The entry point information required for R2R scenarios // simdBaseJitType -- generic argument of the intrinsic. 
-// retType -- return type of the intrinsic. +// pRetType -- return type of the intrinsic. // mustExpand -- true if the intrinsic must return a GenTree*; otherwise, false // // Return Value: @@ -1397,15 +1397,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig R2RARG(CORINFO_CONST_LOOKUP* entryPoint), CorInfoType simdBaseJitType, - var_types retType, + var_types pRetType, unsigned simdSize, bool mustExpand) { - GenTree* retNode = nullptr; - GenTree* op1 = nullptr; - GenTree* op2 = nullptr; - GenTree* op3 = nullptr; - GenTree* op4 = nullptr; + GenTree* retNode = nullptr; + GenTree* op1 = nullptr; + GenTree* op2 = nullptr; + GenTree* op3 = nullptr; + GenTree* op4 = nullptr; + var_types retType = *pRetType; CORINFO_InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsic); From ed4ed9b6891aa83f920becf2b2c3eadff5d4dcb1 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 22 May 2025 09:52:35 +0100 Subject: [PATCH 12/62] Fix x86 build --- src/coreclr/jit/hwintrinsicxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 590dd07190edb8..df810aaf9df9de 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1397,7 +1397,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig R2RARG(CORINFO_CONST_LOOKUP* entryPoint), CorInfoType simdBaseJitType, - var_types pRetType, + var_types* pRetType, unsigned simdSize, bool mustExpand) { From 8512317a9d4bb64e5fdfdf1a038e380c396c1be6 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 22 May 2025 12:21:44 +0100 Subject: [PATCH 13/62] unique test names in templates --- .../Arm/Shared/SveLoadNonFaultingUnOpTest.template | 8 ++++---- .../Arm/Shared/SveLoadVectorMaskedTest.template | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadNonFaultingUnOpTest.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadNonFaultingUnOpTest.template index f5364238d58e01..db2416974cdfd4 100644 --- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadNonFaultingUnOpTest.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadNonFaultingUnOpTest.template @@ -25,7 +25,7 @@ namespace JIT.HardwareIntrinsics.Arm [Fact] public static void {TestName}() { - var test = new LoadUnaryOpTest__{TestName}(); + var test = new LoadNonFaultingUnaryOpTest__{TestName}(); if (test.IsSupported) { @@ -66,7 +66,7 @@ namespace JIT.HardwareIntrinsics.Arm } } - public sealed unsafe class LoadUnaryOpTest__{TestName} + public sealed unsafe class LoadNonFaultingUnaryOpTest__{TestName} { private struct DataTable { @@ -134,7 +134,7 @@ namespace JIT.HardwareIntrinsics.Arm return testStruct; } - public void RunStructFldScenario(LoadUnaryOpTest__{TestName} testClass) + public void RunStructFldScenario(LoadNonFaultingUnaryOpTest__{TestName} testClass) { var result = {Isa}.{Method}(({Op1BaseType}*)testClass._dataTable.inArray1Ptr); @@ -158,7 +158,7 @@ namespace JIT.HardwareIntrinsics.Arm private DataTable _dataTable; - public LoadUnaryOpTest__{TestName}() + public LoadNonFaultingUnaryOpTest__{TestName}() { Succeeded = true; diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadVectorMaskedTest.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadVectorMaskedTest.template index 6bec8d9481000a..829f9384c33610 100644 --- 
a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadVectorMaskedTest.template +++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/SveLoadVectorMaskedTest.template @@ -24,7 +24,7 @@ namespace JIT.HardwareIntrinsics.Arm [Fact] public static void {TestName}() { - var test = new LoadUnaryOpTest__{TestName}(); + var test = new LoadVectorMaskTest__{TestName}(); if (test.IsSupported) { @@ -56,7 +56,7 @@ namespace JIT.HardwareIntrinsics.Arm } } - public sealed unsafe class LoadUnaryOpTest__{TestName} + public sealed unsafe class LoadVectorMaskTest__{TestName} { private struct DataTable { @@ -121,7 +121,7 @@ namespace JIT.HardwareIntrinsics.Arm return testStruct; } - public void RunStructFldScenario(LoadUnaryOpTest__{TestName} testClass) + public void RunStructFldScenario(LoadVectorMaskTest__{TestName} testClass) { {Op1VectorType}<{Op1BaseType}> loadMask = Sve.CreateTrueMask{RetBaseType}(SveMaskPattern.All); @@ -148,7 +148,7 @@ namespace JIT.HardwareIntrinsics.Arm private DataTable _dataTable; - public LoadUnaryOpTest__{TestName}() + public LoadVectorMaskTest__{TestName}() { Succeeded = true; From b895ddd7250e68186cf1065b18674133da56580e Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 22 May 2025 14:40:55 +0100 Subject: [PATCH 14/62] simpler lowering --- src/coreclr/jit/lowerarmarch.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index c1417e3bb8c2c0..bfff2d337be63f 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -3982,16 +3982,15 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (IsInvariantInRange(op2, node) && op2->isEmbeddedMaskingCompatibleHWIntrinsic()) { + bool contain = false; uint32_t maskSize = genTypeSize(node->GetSimdBaseType()); uint32_t operSize = genTypeSize(op2->AsHWIntrinsic()->GetSimdBaseType()); + if (maskSize == operSize) { // If the size of baseType of operation matches that of maskType, then contain // the operation - MakeSrcContained(node, op2); - op2->MakeEmbMaskOp(); - JITDUMP("Containing op2 inside ConditionalSelect\n"); - DISPTREERANGE(BlockRange(), op2); + contain = true; } else { @@ -4010,12 +4009,17 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) uint32_t auxSize = genTypeSize(embOp->GetAuxiliaryType()); if (maskSize == auxSize) { - MakeSrcContained(node, op2); - op2->MakeEmbMaskOp(); - JITDUMP("Containing convert op2 inside ConditionalSelect\n"); - DISPTREERANGE(BlockRange(), op2); + contain = true; } } + + if (contain) + { + MakeSrcContained(node, op2); + op2->MakeEmbMaskOp(); + JITDUMP("Containing op2 inside ConditionalSelect\n"); + DISPTREERANGE(BlockRange(), node); + } } // Handle intrinsics with embedded masks and immediate operands From de06326e8caeb0164da1f17b805d098f1ade4d74 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 22 May 2025 14:41:23 +0100 Subject: [PATCH 15/62] Don't remove embedded ops that may throw --- src/coreclr/jit/gentree.cpp | 10 +++++++++- src/coreclr/jit/gentree.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 6a062b02f2b12d..48d9afebb88e47 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -7184,7 +7184,7 @@ bool GenTree::OperMayThrow(Compiler* comp) return true; } -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) NamedIntrinsic intrinsicId = this->AsHWIntrinsic()->GetHWIntrinsicId(); if (intrinsicId == NI_Vector128_op_Division 
|| intrinsicId == NI_Vector256_op_Division || intrinsicId == NI_Vector512_op_Division) @@ -7192,6 +7192,14 @@ bool GenTree::OperMayThrow(Compiler* comp) assert(varTypeIsInt(AsHWIntrinsic()->GetSimdBaseType())); return true; } +#elif defined(TARGET_ARM64) + NamedIntrinsic intrinsicId = this->AsHWIntrinsic()->GetHWIntrinsicId(); + if (intrinsicId == NI_Sve_ConditionalSelect) + { + // If op2 is embedded, then check if that will throw. + GenTree* op2 = this->AsHWIntrinsic()->Op(2); + return (op2->IsEmbMaskOp() && op2->OperMayThrow(comp)); + } #endif // TARGET_XARCH } #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 270b95e49c9b98..66f6d9ad87645b 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -2066,6 +2066,7 @@ struct GenTree assert(IsValue()); gtFlags &= ~GTF_CONTAINED; ClearRegOptional(); + assert(!IsEmbMaskOp()); } bool CanCSE() const From 33674a0409c5b61f0dc74b587800ad24730a36e8 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 22 May 2025 15:01:03 +0100 Subject: [PATCH 16/62] Clear embOp when clearing contained --- src/coreclr/jit/gentree.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 66f6d9ad87645b..9fa9c7d6ac839f 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -2066,7 +2066,12 @@ struct GenTree assert(IsValue()); gtFlags &= ~GTF_CONTAINED; ClearRegOptional(); - assert(!IsEmbMaskOp()); +#ifdef FEATURE_HW_INTRINSICS + if (OperIsHWIntrinsic()) + { + ClearEmbMaskOp(); + } +#endif } bool CanCSE() const @@ -2289,6 +2294,12 @@ struct GenTree gtFlags |= GTF_HW_EM_OP; } + void ClearEmbMaskOp() + { + assert(OperIsHWIntrinsic()); + gtFlags &= ~GTF_HW_EM_OP; + } + #endif // FEATURE_HW_INTRINSICS static bool HandleKindDataIsInvariant(GenTreeFlags flags); From 5c0ae18e5960a494946c20caa134bb0e79edcca3 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 27 May 2025 16:41:05 +0100 Subject: [PATCH 17/62] Import masks as gtNewVconNode --- src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/hwintrinsicarm64.cpp | 127 +++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index edccd2d874f49a..7a6e6e7ff2083c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3130,6 +3130,7 @@ class Compiler #if defined(TARGET_ARM64) GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize); + GenTree* gtNewSimdCnsVecTrueMaskPattern(var_types retType, int simdSize, var_types simdBaseType, int64_t pattern); #endif GenTree* gtNewSimdBinOpNode(genTreeOps op, diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index e55efc85997518..dce9c6edb210ce 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2770,7 +2770,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Sve_CreateFalseMaskUInt64: { // Import as a constant vector 0 - retNode = gtNewZeroConNode(retType); + GenTreeVecCon* vecCon = gtNewVconNode(retType); + vecCon->gtSimdVal = simd_t::Zero(); + retNode = vecCon; break; } @@ -2789,17 +2791,43 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impPopStack().val; - if (op1->IsIntegralConst(31)) + // Where possible, import a constant vector. + if (op1->IsIntegralConst()) { - // This is considered to be an all true mask. 
Import as a constant vector all bits set. - // TODO: Depending on the vector length, we may be able to consider other patterns - // as all true mask, however these will not be commonly used. - retNode = gtNewAllBitsSetConNode(retType); + int64_t pattern = op1->AsIntConCommon()->IntegralValue(); + switch (pattern) + { + case 0: // POW2 + case 1: // VL1 + case 2: // VL2 + case 3: // VL3 + case 4: // VL4 + case 5: // VL5 + case 6: // VL6 + case 7: // VL7 + case 8: // VL8 + case 9: // VL16 + case 10: // VL32 + case 11: // VL64 + case 12: // VL128 + case 13: // VL256 + case 29: // MUL4 + case 30: // MUL3 + case 31: // ALL + retNode = gtNewSimdCnsVecTrueMaskPattern(retType, simdSize, simdBaseType, pattern); + break; + + default: + // Invalid enum. + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); + *pRetType = TYP_MASK; + break; + } } else { // Create a node of TYP_MASK, making sure to update pRetType - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); *pRetType = TYP_MASK; } break; @@ -3296,7 +3324,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } //------------------------------------------------------------------------ -// gtNewSimdEmbeddedMaskNode: Create an embedded mask +// gtNewSimdAllTrueMaskNode: Create an embedded mask // // Arguments: // simdBaseJitType -- the base jit type of the nodes being masked @@ -3310,4 +3338,87 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); } +//------------------------------------------------------------------------ +// gtNewSimdCnsVecTrueMaskPattern: Create a constant vector with a true mask bit pattern +// +// Arguments: +// retType -- return type of the intrinsic. +// simdSize -- the simd size of the nodes being created +// simdBaseJitType -- the base jit type of the nodes being created +// pattern -- The pattern to use as defined by the Arm PTRUE instruction +// +// Return Value: +// The node +// +GenTree* Compiler::gtNewSimdCnsVecTrueMaskPattern(var_types retType, + int simdSize, + var_types simdBaseType, + int64_t pattern) +{ + int64_t lanes = simdSize / genTypeSize(simdBaseType); + int64_t laneBits = genTypeSize(simdBaseType) * 8; + int64_t laneVal = (laneBits > 32) ? 
UINT64_MAX : (((int64_t)1 << laneBits) - 1); + + // Ensure the base type is integral + if (simdBaseType == TYP_DOUBLE) + { + simdBaseType = TYP_ULONG; + } + else if (simdBaseType == TYP_FLOAT) + { + simdBaseType = TYP_UINT; + } + + GenTreeVecCon* vecCon = gtNewVconNode(retType); + + int64_t lanesToFill = 0; + switch (pattern) + { + case 0: // POW2 - The largest power of 2 + case 31: // ALL - All lanes + lanesToFill = lanes; + break; + + case 1: // VL1 - exactly 1 lane, etc + case 2: // VL2 + case 3: // VL3 + case 4: // VL4 + case 5: // VL5 + case 6: // VL6 + case 7: // VL7 + case 8: // VL8 + lanesToFill = pattern; + break; + + case 9: // VL16 - exactly 16 lanes, etc + case 10: // VL32 + case 11: // VL64 + case 12: // VL128 + case 13: // VL256 + lanesToFill = ((pattern - 8) * 16); + break; + + case 29: // MUL4 - The largest multiple of 4 + lanesToFill = (lanes - (lanes % 4)); + break; + + case 30: // MUL3 - The largest multiple of 3 + lanesToFill = (lanes - (lanes % 3)); + break; + + default: + assert(false); + break; + } + + lanesToFill = std::min(lanesToFill, lanes); + + for (int index = 0; index < lanesToFill; index++) + { + vecCon->SetElementIntegral(simdBaseType, index, laneVal); + } + + return vecCon; +} + #endif // FEATURE_HW_INTRINSICS From 239b82d239fb9a23a0ae57628f5ea44b25f4b5b8 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 27 May 2025 17:22:56 +0100 Subject: [PATCH 18/62] Remove pRetType --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/hwintrinsic.cpp | 10 +++++++--- src/coreclr/jit/hwintrinsicarm64.cpp | 13 +++++-------- src/coreclr/jit/hwintrinsicxarch.cpp | 15 +++++++-------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 7a6e6e7ff2083c..bcb9b18fcb47c2 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4703,7 +4703,7 @@ class Compiler CORINFO_SIG_INFO* sig R2RARG(CORINFO_CONST_LOOKUP* entryPoint), CorInfoType simdBaseJitType, - var_types* retType, + var_types retType, unsigned simdSize, bool mustExpand); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index ca6c77e7876994..09558653ed343f 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -2306,8 +2306,13 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } else { - retNode = impSpecialIntrinsic(intrinsic, clsHnd, method, sig R2RARG(entryPoint), simdBaseJitType, &nodeRetType, + retNode = impSpecialIntrinsic(intrinsic, clsHnd, method, sig R2RARG(entryPoint), simdBaseJitType, nodeRetType, simdSize, mustExpand); + +#if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_ARM64) + // The special import may have switched the type of the node. + nodeRetType = retNode->gtType; +#endif } if (setMethodHandle && (retNode != nullptr)) @@ -2380,10 +2385,9 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } } - if (retType != nodeRetType) + if (nodeRetType == TYP_MASK) { // HWInstrinsic returns a mask, but all returns must be vectors, so convert mask to vector. - assert(nodeRetType == TYP_MASK); GenTreeHWIntrinsic* op = retNode->AsHWIntrinsic(); diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index dce9c6edb210ce..ac77cc21e7874c 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -639,7 +639,7 @@ GenTree* Compiler::impNonConstFallback(NamedIntrinsic intrinsic, var_types simdT // sig -- signature of the intrinsic call. 
// entryPoint -- The entry point information required for R2R scenarios // simdBaseJitType -- generic argument of the intrinsic. -// pRetType -- return type of the intrinsic. May be updated. +// retType -- return type of the intrinsic. // mustExpand -- true if the intrinsic must return a GenTree*; otherwise, false // // Return Value: @@ -650,13 +650,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig R2RARG(CORINFO_CONST_LOOKUP* entryPoint), CorInfoType simdBaseJitType, - var_types* pRetType, + var_types retType, unsigned simdSize, bool mustExpand) { const HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic); const int numArgs = sig->numArgs; - var_types retType = *pRetType; // The vast majority of "special" intrinsics are Vector64/Vector128 methods. // The only exception is ArmBase.Yield which should be treated differently. @@ -2791,7 +2790,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impPopStack().val; - // Where possible, import a constant vector. + // Where possible, import a constant vector to allow for optimisations. if (op1->IsIntegralConst()) { int64_t pattern = op1->AsIntConCommon()->IntegralValue(); @@ -2818,17 +2817,15 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; default: - // Invalid enum. + // Invalid enum, so generate the create true mask node. retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); - *pRetType = TYP_MASK; break; } } else { - // Create a node of TYP_MASK, making sure to update pRetType + // Do not know the pattern, so generate the create true mask node. retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); - *pRetType = TYP_MASK; } break; } diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index df810aaf9df9de..6098fed4b4485c 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1377,7 +1377,7 @@ GenTree* Compiler::impNonConstFallback(NamedIntrinsic intrinsic, var_types simdT // sig -- signature of the intrinsic call. // entryPoint -- The entry point information required for R2R scenarios // simdBaseJitType -- generic argument of the intrinsic. -// pRetType -- return type of the intrinsic. +// retType -- return type of the intrinsic. 
// mustExpand -- true if the intrinsic must return a GenTree*; otherwise, false // // Return Value: @@ -1397,16 +1397,15 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig R2RARG(CORINFO_CONST_LOOKUP* entryPoint), CorInfoType simdBaseJitType, - var_types* pRetType, + var_types retType, unsigned simdSize, bool mustExpand) { - GenTree* retNode = nullptr; - GenTree* op1 = nullptr; - GenTree* op2 = nullptr; - GenTree* op3 = nullptr; - GenTree* op4 = nullptr; - var_types retType = *pRetType; + GenTree* retNode = nullptr; + GenTree* op1 = nullptr; + GenTree* op2 = nullptr; + GenTree* op3 = nullptr; + GenTree* op4 = nullptr; CORINFO_InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsic); From ba2aa60964a106f10346dda435511b0360c78caa Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 28 May 2025 11:33:46 +0100 Subject: [PATCH 19/62] Add nullptr check --- src/coreclr/jit/hwintrinsic.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 2f4c865f71c74f..ff1177922f5cce 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -2446,8 +2446,11 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, simdSize, mustExpand); #if defined(FEATURE_MASKED_HW_INTRINSICS) && defined(TARGET_ARM64) - // The special import may have switched the type of the node. - nodeRetType = retNode->gtType; + if (retNode != nullptr) + { + // The special import may have switched the type of the node. + nodeRetType = retNode->gtType; + } #endif } From 86967988248b01fba1c79a5efd2d67aaeb46decf Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 28 May 2025 12:33:25 +0100 Subject: [PATCH 20/62] Add AOT TODO --- src/coreclr/jit/hwintrinsicarm64.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 3f364f7211cc97..9145c5d5b3c77d 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2883,6 +2883,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impPopStack().val; + // TODO: For AOT, always do the gtNewSimdHWIntrinsicNode as we don't know the vector size. + // Where possible, import a constant vector to allow for optimisations. 
if (op1->IsIntegralConst()) { From cc786a958157e1e7dc8772158b8d20a13da66f43 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 29 May 2025 08:22:10 +0100 Subject: [PATCH 21/62] Add codegen support for CNS_MASK --- src/coreclr/jit/codegenarm64.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 5ffb79df1f8448..c7fcd28c398bbd 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2338,6 +2338,29 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre break; } + + case GT_CNS_MSK: + { + GenTreeMskCon* mask = tree->AsMskCon(); + + emitter* emit = GetEmitter(); + // emitAttr attr = emitTypeSize(targetType); + // insOpts opt = emit::optGetSveInsOpt(emitTypeSize(intrin.baseType)); + + if (mask->IsAllBitsSet()) + { + emit->emitIns_R_PATTERN(INS_sve_ptrue, EA_SCALABLE, targetReg, INS_OPTS_SCALABLE_B, SVE_PATTERN_ALL); + } + else if (mask->IsZero()) + { + emit->emitInsSve_R(INS_sve_pfalse, EA_SCALABLE, targetReg, INS_OPTS_SCALABLE_B); + } + else + { + unreached(); + } + break; + } #endif // FEATURE_SIMD default: From 3690d19c475c6e6357220426c0839b9186fa66ea Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 2 Jun 2025 10:25:47 +0100 Subject: [PATCH 22/62] Add const vector folding for Arm64 --- src/coreclr/jit/compiler.h | 1 + src/coreclr/jit/gentree.cpp | 113 ++++++++++++++++++------------- src/coreclr/jit/lower.h | 1 - src/coreclr/jit/lowerarmarch.cpp | 40 ----------- 4 files changed, 67 insertions(+), 88 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 21979aa0303e6d..6d1fa2de97222d 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3702,6 +3702,7 @@ class Compiler #if defined(FEATURE_HW_INTRINSICS) GenTree* gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree); + GenTreeMskCon* gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree, GenTreeVecCon* vecCon); #endif // FEATURE_HW_INTRINSICS // Options to control behavior of gtTryRemoveBoxUpstreamEffects diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 72dee32fe6e5b0..51a317b08ca6a6 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -32118,6 +32118,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } #if defined(FEATURE_MASKED_HW_INTRINSICS) + //Fold ConvertMaskToVector(ConvertVectorToMask(vec)) to vec if (tree->OperIsConvertMaskToVector()) { GenTree* op = op1; @@ -32150,6 +32151,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } } + //Fold ConvertVectorToMask(ConvertMaskToVector(mask)) to mask if (tree->OperIsConvertVectorToMask()) { GenTree* op = op1; @@ -32248,58 +32250,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) resultNode = gtNewVconNode(retType, &simdVal); } - else if (tree->OperIsConvertVectorToMask()) - { - GenTreeVecCon* vecCon = cnsNode->AsVecCon(); - GenTreeMskCon* mskCon = gtNewMskConNode(retType); - - switch (vecCon->TypeGet()) - { - case TYP_SIMD8: - { - EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd8Val); - break; - } - - case TYP_SIMD12: - { - EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd12Val); - break; - } - - case TYP_SIMD16: - { - EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd16Val); - break; - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - 
EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd32Val); - break; - } - - case TYP_SIMD64: - { - EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd64Val); - break; - } -#endif // TARGET_XARCH - - default: - { - unreached(); - } - } - - resultNode = mskCon; - } #endif // FEATURE_MASKED_HW_INTRINSICS else { switch (ni) { +#if defined(TARGET_AMD64) + case NI_EVEX_ConvertVectorToMask: + resultNode = gtFoldExprConvertVecCnsToMask(tree, vecCon); + break; +#endif // TARGET_AMD64 + #ifdef TARGET_ARM64 case NI_ArmBase_LeadingZeroCount: #else @@ -33137,6 +33098,10 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) switch (ni) { #ifdef TARGET_ARM64 + case NI_Sve_ConvertVectorToMask: + resultNode = gtFoldExprConvertVecCnsToMask(tree, cnsNode->AsVecCon()); + break; + case NI_AdvSimd_MultiplyByScalar: case NI_AdvSimd_Arm64_MultiplyByScalar: { @@ -33346,6 +33311,60 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } #endif // FEATURE_HW_INTRINSICS + +GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree, GenTreeVecCon* vecCon) +{ + assert(tree->OperIsConvertVectorToMask()); + assert(vecCon == tree->Op(1) || vecCon == tree->Op(2)); + + var_types retType = tree->TypeGet(); + var_types simdBaseType = tree->GetSimdBaseType(); + GenTreeMskCon* mskCon = gtNewMskConNode(retType); + + switch (vecCon->TypeGet()) + { + case TYP_SIMD8: + { + EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd8Val); + break; + } + + case TYP_SIMD12: + { + EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd12Val); + break; + } + + case TYP_SIMD16: + { + EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd16Val); + break; + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + { + EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd32Val); + break; + } + + case TYP_SIMD64: + { + EvaluateSimdCvtVectorToMask(simdBaseType, &mskCon->gtSimdMaskVal, vecCon->gtSimd64Val); + break; + } +#endif // TARGET_XARCH + + default: + { + unreached(); + } + } + + return mskCon; +} + + //------------------------------------------------------------------------ // gtCanSkipCovariantStoreCheck: see if storing a ref type value to an array // can skip the array store covariance check. 
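
To picture what gtFoldExprConvertVecCnsToMask computes, the following is a
minimal standalone sketch of the per-lane packing, not JIT code: the helper
name FoldVectorConstantToMask and the fixed 16-byte vector size are
illustrative assumptions, and the "lane is set" test is simplified to a
non-zero check, where the JIT keys off a target-specific significant bit.

    // Standalone sketch: pack a constant vector into SVE-style predicate
    // bits, one bit per lane, spaced sizeof(Lane) bit positions apart.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    template <typename Lane>
    uint64_t FoldVectorConstantToMask(const uint8_t (&vec)[16])
    {
        constexpr uint32_t laneCount = 16 / sizeof(Lane);
        uint64_t           mask      = 0;

        for (uint32_t i = 0; i < laneCount; i++)
        {
            Lane lane;
            memcpy(&lane, &vec[i * sizeof(Lane)], sizeof(Lane));

            if (lane != 0) // simplified; the JIT tests a significant bit
            {
                mask |= uint64_t(1) << (i * sizeof(Lane));
            }
        }
        return mask;
    }

    int main()
    {
        uint8_t allBitsSet[16];
        memset(allBitsSet, 0xFF, sizeof(allBitsSet));

        // 4-byte lanes: prints 1111, i.e. bit 0 of each 32-bit lane is set.
        printf("%llx\n", (unsigned long long)FoldVectorConstantToMask<uint32_t>(allBitsSet));
        return 0;
    }

An all-bits-set vector of 4-byte lanes therefore folds to the predicate bit
pattern 0x1111 rather than 0xFFFF, which is why the resulting GT_CNS_MSK is
only meaningful relative to an element size.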
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 5d0e12641d6470..019a2f2b48a4bd 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -449,7 +449,6 @@ class Lowering final : public Phase GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node); - GenTree* LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* node); #if defined(TARGET_XARCH) void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index d60ea16c2f4f3c..6bfe619a25fe7b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1817,9 +1817,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_ConditionalSelect: return LowerHWIntrinsicCndSel(node); - case NI_Sve_ConvertVectorToMask: - return LowerHWIntrinsicConvertVectorToMask(node); - case NI_Sve_SetFfr: { StoreFFRValue(node); @@ -4222,43 +4219,6 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) return cndSelNode->gtNext; } -GenTree* Lowering::LowerHWIntrinsicConvertVectorToMask(GenTreeHWIntrinsic* mask) -{ - assert(mask->OperIsHWIntrinsic(NI_Sve_ConvertVectorToMask)); - - GenTree* op1 = mask->Op(1); - GenTree* op2 = mask->Op(2); - - if (op2->IsVectorZero()) - { - // Transform ConvertVectorToMask(..., ConstVec(0)) to FalseMask - - assert(op1->OperIsHWIntrinsic(NI_Sve_CreateTrueMaskAll)); - BlockRange().Remove(op1); - BlockRange().Remove(op2); - mask->ResetHWIntrinsicId(NI_Sve_CreateFalseMaskAll, comp); - - JITDUMP("lowering ConvertVectorToMask(ZeroVector) to FalseMask:\n"); - DISPTREERANGE(BlockRange(), mask); - JITDUMP("\n"); - } - if (op2->IsVectorAllBitsSet()) - { - // Transform ConvertVectorToMask(..., ConstVec(11111...)) to TrueMask - - assert(op1->OperIsHWIntrinsic(NI_Sve_CreateTrueMaskAll)); - BlockRange().Remove(op1); - BlockRange().Remove(op2); - mask->ResetHWIntrinsicId(NI_Sve_CreateTrueMaskAll, comp); - - JITDUMP("lowering ConvertVectorToMask(ZeroVector) to TrueMask:\n"); - DISPTREERANGE(BlockRange(), mask); - JITDUMP("\n"); - } - - return mask->gtNext; -} - #if defined(TARGET_ARM64) //---------------------------------------------------------------------------------------------- // StoreFFRValue: For hwintrinsic that produce a first faulting register (FFR) value, create From 8a12a5ffb8b88740b2cc911953260553edc57231 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 2 Jun 2025 17:28:02 +0100 Subject: [PATCH 23/62] Add mask patterns --- src/coreclr/jit/codegenarm64.cpp | 33 +++++-- src/coreclr/jit/hwintrinsicarm64.cpp | 2 +- src/coreclr/jit/instr.h | 3 +- src/coreclr/jit/simd.h | 138 ++++++++++++++++++++++++++- 4 files changed, 160 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index c7fcd28c398bbd..2f51d108c7aa82 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2342,23 +2342,38 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre case GT_CNS_MSK: { GenTreeMskCon* mask = tree->AsMskCon(); - emitter* emit = GetEmitter(); - // emitAttr attr = emitTypeSize(targetType); - // insOpts opt = emit::optGetSveInsOpt(emitTypeSize(intrin.baseType)); - if (mask->IsAllBitsSet()) + // Try every type until a match is found + + if (mask->IsZero()) { - 
emit->emitIns_R_PATTERN(INS_sve_ptrue, EA_SCALABLE, targetReg, INS_OPTS_SCALABLE_B, SVE_PATTERN_ALL); + emit->emitInsSve_R(INS_sve_pfalse, EA_SCALABLE, targetReg, INS_OPTS_SCALABLE_B); + break; } - else if (mask->IsZero()) + + insOpts opt = INS_OPTS_SCALABLE_B; + SveMaskPattern pat = EvaluateSimdMaskPattern(TYP_BYTE, mask->gtSimdMaskVal); + + if (pat == SveMaskPatternNone) { - emit->emitInsSve_R(INS_sve_pfalse, EA_SCALABLE, targetReg, INS_OPTS_SCALABLE_B); + opt = INS_OPTS_SCALABLE_H; + pat = EvaluateSimdMaskPattern(TYP_SHORT, mask->gtSimdMaskVal); } - else + + if (pat == SveMaskPatternNone) { - unreached(); + opt = INS_OPTS_SCALABLE_S; + pat = EvaluateSimdMaskPattern(TYP_INT, mask->gtSimdMaskVal); } + + if (pat == SveMaskPatternNone) + { + opt = INS_OPTS_SCALABLE_D; + pat = EvaluateSimdMaskPattern(TYP_LONG, mask->gtSimdMaskVal); + } + + emit->emitIns_R_PATTERN(INS_sve_ptrue, EA_SCALABLE, targetReg, opt, (insSvePattern)pat); break; } #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 9145c5d5b3c77d..0939a47eb5dd45 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -3463,7 +3463,7 @@ GenTree* Compiler::gtNewSimdCnsVecTrueMaskPattern(var_types retType, { int64_t lanes = simdSize / genTypeSize(simdBaseType); int64_t laneBits = genTypeSize(simdBaseType) * 8; - int64_t laneVal = (laneBits > 32) ? UINT64_MAX : (((int64_t)1 << laneBits) - 1); + int64_t laneVal = 1; //(laneBits > 32) ? UINT64_MAX : (((int64_t)1 << laneBits) - 1); // Ensure the base type is integral if (simdBaseType == TYP_DOUBLE) diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index d7aa4d21bcebd2..06995c7615d84c 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -440,7 +440,8 @@ enum insSvePattern : unsigned SVE_PATTERN_VL256 = 13, // 256 elements. SVE_PATTERN_MUL4 = 29, // The largest multiple of 4. SVE_PATTERN_MUL3 = 30, // The largest multiple of 3. - SVE_PATTERN_ALL = 31 // All available (implicitly a multiple of two). + SVE_PATTERN_ALL = 31, // All available (implicitly a multiple of two). + SVE_PATTERN_INVALID = 14 }; // Prefetch operation specifier for SVE instructions such as prfb. diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index d0450fa91caff6..4a2a97cfa86e9c 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1526,7 +1526,7 @@ void EvaluateSimdCvtMaskToVector(TSimd* result, simdmask_t arg0) isSet = ((mask >> i) & 1) != 0; #elif defined(TARGET_ARM64) // For Arm64 we have count total bits to read, but - // they are sizeof(TBase) bits apart. We still set + // they are sizeof(TBase) bits apart. 
We set
+            // the result element to AllBitsSet or Zero depending
+            // on the corresponding mask bit
 
@@ -1598,14 +1598,17 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0)
     uint32_t count = sizeof(TSimd) / sizeof(TBase);
     uint64_t mask  = 0;
 
-    TBase mostSignificantBit = static_cast<TBase>(1) << ((sizeof(TBase) * 8) - 1);
+    TBase significantBit = 1;
+#if defined(TARGET_XARCH)
+    significantBit = static_cast<TBase>(1) << ((sizeof(TBase) * 8) - 1);
+#endif
 
     for (uint32_t i = 0; i < count; i++)
     {
         TBase input0;
         memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase));
 
-        if ((input0 & mostSignificantBit) != 0)
+        if ((input0 & significantBit) != 0)
         {
 #if defined(TARGET_XARCH)
             // For xarch we have count sequential bits to write
@@ -1615,9 +1618,9 @@ void EvaluateSimdCvtVectorToMask(simdmask_t* result, TSimd arg0)
             mask |= static_cast<uint64_t>(1) << i;
 #elif defined(TARGET_ARM64)
             // For Arm64 we have count total bits to write, but
-            // they are sizeof(TBase) bits apart. We still set
+            // they are sizeof(TBase) bits apart. We set
             // depending on if the corresponding input element
-            // has its most significant bit set
+            // has its least significant bit set
 
             mask |= static_cast<uint64_t>(1) << (i * sizeof(TBase));
 #else
@@ -1670,6 +1673,131 @@ void EvaluateSimdCvtVectorToMask(var_types baseType, simdmask_t* result, TSimd a
         }
     }
 }
+
+
+#if defined(TARGET_ARM64)
+
+enum SveMaskPattern
+{
+    SveMaskPatternLargestPowerOf2    = 0,  // The largest power of 2.
+    SveMaskPatternVectorCount1       = 1,  // Exactly 1 element.
+    SveMaskPatternVectorCount2       = 2,  // Exactly 2 elements.
+    SveMaskPatternVectorCount3       = 3,  // Exactly 3 elements.
+    SveMaskPatternVectorCount4       = 4,  // Exactly 4 elements.
+    SveMaskPatternVectorCount5       = 5,  // Exactly 5 elements.
+    SveMaskPatternVectorCount6       = 6,  // Exactly 6 elements.
+    SveMaskPatternVectorCount7       = 7,  // Exactly 7 elements.
+    SveMaskPatternVectorCount8       = 8,  // Exactly 8 elements.
+    SveMaskPatternVectorCount16      = 9,  // Exactly 16 elements.
+    SveMaskPatternVectorCount32      = 10, // Exactly 32 elements.
+    SveMaskPatternVectorCount64      = 11, // Exactly 64 elements.
+    SveMaskPatternVectorCount128     = 12, // Exactly 128 elements.
+    SveMaskPatternVectorCount256     = 13, // Exactly 256 elements.
+    SveMaskPatternLargestMultipleOf4 = 29, // The largest multiple of 4.
+    SveMaskPatternLargestMultipleOf3 = 30, // The largest multiple of 3.
+    SveMaskPatternAll                = 31, // All available (implicitly a multiple of two).
+    SveMaskPatternNone               = 14  // Invalid
+};
+
+
+template <typename TSimd, typename TBase>
+SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0)
+{
+    uint32_t count = sizeof(TSimd) / sizeof(TBase);
+
+    uint64_t mask;
+    memcpy(&mask, &arg0.u8[0], sizeof(uint64_t));
+    uint32_t finalOne = count;
+
+    // A mask pattern starts with zero or more 1s and then the rest of the mask is filled with 0s.
+
+    // Find an unbroken sequence of 1s.
+    for (uint32_t i = 0; i < count; i++)
+    {
+        // For Arm64 we have count total bits to read, but
+        // they are sizeof(TBase) bits apart. We set
+        // the result element to AllBitsSet or Zero depending
+        // on the corresponding mask bit
+
+        bool isSet = ((mask >> (i * sizeof(TBase))) & 1) != 0;
+        if(!isSet)
+        {
+            finalOne = i;
+            break;
+        }
+    }
+
+    // Find an unbroken sequence of 0s.
+    for (uint32_t i = finalOne; i < count; i++)
+    {
+        // For Arm64 we have count total bits to read, but
+        // they are sizeof(TBase) bits apart. 
We set + // the result element to AllBitsSet or Zero depending + // on the corresponding mask bit + + bool isSet = ((mask >> (i * sizeof(TBase))) & 1) != 0; + if(isSet) + { + // Invalid sequence + return SveMaskPatternNone; + } + } + + if (finalOne == count) + { + return SveMaskPatternAll; + } + else if (finalOne >= SveMaskPatternVectorCount1 && finalOne <= SveMaskPatternVectorCount8) + { + return (SveMaskPattern)finalOne; + } + else + { + //TODO: Add other patterns as required. These probably won't be seen until we get + // to wider vector lengths. + return SveMaskPatternNone; + } +} + +template +SveMaskPattern EvaluateSimdMaskPattern(var_types baseType, simdmask_t arg0) +{ + switch (baseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + return EvaluateSimdMaskPattern(arg0); + } + + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + { + return EvaluateSimdMaskPattern(arg0); + } + + case TYP_BYTE: + case TYP_UBYTE: + { + return EvaluateSimdMaskPattern(arg0); + } + + case TYP_SHORT: + case TYP_USHORT: + { + return EvaluateSimdMaskPattern(arg0); + } + + default: + { + unreached(); + } + } +} +#endif // TARGET_ARM64 + #endif // FEATURE_MASKED_HW_INTRINSICS #ifdef FEATURE_SIMD From cd4d2c7861a5ceae6fb46df20afcc993596a9a9d Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Mon, 2 Jun 2025 18:20:35 +0100 Subject: [PATCH 24/62] Move tests to SVE --- src/tests/JIT/opt/{MaskConversions => SVE}/ConstantMasks.cs | 0 src/tests/JIT/opt/{MaskConversions => SVE}/ConstantMasks.csproj | 0 .../JIT/opt/{MaskConversions => SVE}/ConstantMasksOp2Fixed.cs | 0 .../JIT/opt/{MaskConversions => SVE}/ConstantMasksOp2Fixed.csproj | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename src/tests/JIT/opt/{MaskConversions => SVE}/ConstantMasks.cs (100%) rename src/tests/JIT/opt/{MaskConversions => SVE}/ConstantMasks.csproj (100%) rename src/tests/JIT/opt/{MaskConversions => SVE}/ConstantMasksOp2Fixed.cs (100%) rename src/tests/JIT/opt/{MaskConversions => SVE}/ConstantMasksOp2Fixed.csproj (100%) diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasks.cs b/src/tests/JIT/opt/SVE/ConstantMasks.cs similarity index 100% rename from src/tests/JIT/opt/MaskConversions/ConstantMasks.cs rename to src/tests/JIT/opt/SVE/ConstantMasks.cs diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasks.csproj b/src/tests/JIT/opt/SVE/ConstantMasks.csproj similarity index 100% rename from src/tests/JIT/opt/MaskConversions/ConstantMasks.csproj rename to src/tests/JIT/opt/SVE/ConstantMasks.csproj diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs similarity index 100% rename from src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.cs rename to src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs diff --git a/src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.csproj b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.csproj similarity index 100% rename from src/tests/JIT/opt/MaskConversions/ConstantMasksOp2Fixed.csproj rename to src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.csproj From 3d176a500e55701453269969ee8494debcd8250a Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 3 Jun 2025 14:50:15 +0100 Subject: [PATCH 25/62] Add isTrueMask() Change-Id: I456498f06d454e6ed57ce935e195b721e2c6d225 --- src/coreclr/jit/gentree.cpp | 86 ++++++++++++++++++++- src/coreclr/jit/gentree.h | 56 +------------- src/coreclr/jit/hwintrinsicarm64.cpp | 4 +- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 4 +- src/coreclr/jit/lowerarmarch.cpp | 6 +- 
 src/tests/JIT/opt/SVE/ConstantMasks.cs       |  2 +
 6 files changed, 96 insertions(+), 62 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 51a317b08ca6a6..b0699047fb5ac8 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -33243,7 +33243,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                 break;
             }
 
-            if (op1->IsVectorAllBitsSet() || op1->IsMaskAllBitsSet())
+            if (op1->IsVectorAllBitsSet() || op1->IsTrueMask(tree))
             {
                 if ((op3->gtFlags & GTF_SIDE_EFFECT) != 0)
                 {
@@ -33364,6 +33364,90 @@ GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree,
     return mskCon;
 }
 
+//------------------------------------------------------------------------
+// IsTrueMask: Is the given node a true mask
+//
+// Arguments:
+//    parent - parent of the node
+//
+// Return Value:
+//    true if the node is a true mask for the given parent; otherwise, false.
+//
+// Note that a byte true mask is different from an int true mask; therefore,
+// the usage of the mask (i.e. the type of the parent) needs to be taken into account.
+//
+bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const
+{
+    var_types ParentSimdBaseType = JitType2PreciseVarType(parent->GetSimdBaseJitType());
+
+#ifdef TARGET_ARM64
+    if (OperIsHWIntrinsic())
+    {
+        NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId();
+        if (id == NI_Sve_ConvertMaskToVector)
+        {
+            GenTree* op1 = AsHWIntrinsic()->Op(1);
+            assert(op1->OperIsHWIntrinsic());
+            id = op1->AsHWIntrinsic()->GetHWIntrinsicId();
+        }
+
+        // Only TrueMaskAlls will be imported
+        if (id != NI_Sve_CreateTrueMaskAll)
+        {
+            return false;
+        }
+
+        // Only a valid true mask if the parent has the same base type
+        return genTypeSize(ParentSimdBaseType) == genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType()));
+    }
+    else if (IsCnsMsk())
+    {
+        switch (parent->gtType)
+        {
+            case TYP_SIMD8:
+                return SveMaskPatternAll == EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+
+            case TYP_SIMD12:
+                return SveMaskPatternAll == EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+
+            case TYP_SIMD16:
+                return SveMaskPatternAll == EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+
+            default:
+                unreached();
+        }
+    }
+
+#endif
+    return false;
+}
+
+
+bool GenTree::IsMaskZero() const
+{
+#ifdef TARGET_ARM64
+    static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble,
+                                       NI_Sve_CreateFalseMaskInt16, NI_Sve_CreateFalseMaskInt32,
+                                       NI_Sve_CreateFalseMaskInt64, NI_Sve_CreateFalseMaskSByte,
+                                       NI_Sve_CreateFalseMaskSingle, NI_Sve_CreateFalseMaskUInt16,
+                                       NI_Sve_CreateFalseMaskUInt32, NI_Sve_CreateFalseMaskUInt64));
+
+    if (OperIsHWIntrinsic())
+    {
+        NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId();
+        if (id == NI_Sve_ConvertMaskToVector)
+        {
+            GenTree* op1 = AsHWIntrinsic()->Op(1);
+            assert(op1->OperIsHWIntrinsic());
+            id = op1->AsHWIntrinsic()->GetHWIntrinsicId();
+        }
+        return ((id == NI_Sve_CreateFalseMaskAll) ||
+                ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64)));
+    }
+
+#endif
+    return false;
+}
+
 //------------------------------------------------------------------------
 // gtCanSkipCovariantStoreCheck: see if storing a ref type value to an array
 //    can skip the array store covariance check.
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index c26ca1c8ec5c84..e92a0d48045871 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1818,8 +1818,8 @@ struct GenTree
     inline bool IsVectorCreate() const;
     inline bool
IsVectorAllBitsSet() const; inline bool IsVectorBroadcast(var_types simdBaseType) const; - inline bool IsMaskAllBitsSet() const; - inline bool IsMaskZero() const; + bool IsTrueMask(GenTreeHWIntrinsic* parent) const; + bool IsMaskZero() const; inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType); @@ -9723,58 +9723,6 @@ inline bool GenTree::IsVectorBroadcast(var_types simdBaseType) const return false; } -inline bool GenTree::IsMaskAllBitsSet() const -{ -#ifdef TARGET_ARM64 - static_assert_no_msg(AreContiguous(NI_Sve_CreateTrueMaskByte, NI_Sve_CreateTrueMaskDouble, - NI_Sve_CreateTrueMaskInt16, NI_Sve_CreateTrueMaskInt32, - NI_Sve_CreateTrueMaskInt64, NI_Sve_CreateTrueMaskSByte, - NI_Sve_CreateTrueMaskSingle, NI_Sve_CreateTrueMaskUInt16, - NI_Sve_CreateTrueMaskUInt32, NI_Sve_CreateTrueMaskUInt64)); - - if (OperIsHWIntrinsic()) - { - NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); - if (id == NI_Sve_ConvertMaskToVector) - { - GenTree* op1 = AsHWIntrinsic()->Op(1); - assert(op1->OperIsHWIntrinsic()); - id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); - } - return ((id == NI_Sve_CreateTrueMaskAll) || - ((id >= NI_Sve_CreateTrueMaskByte) && (id <= NI_Sve_CreateTrueMaskUInt64))); - } - -#endif - return false; -} - -inline bool GenTree::IsMaskZero() const -{ -#ifdef TARGET_ARM64 - static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble, - NI_Sve_CreateFalseMaskInt16, NI_Sve_CreateFalseMaskInt32, - NI_Sve_CreateFalseMaskInt64, NI_Sve_CreateFalseMaskSByte, - NI_Sve_CreateFalseMaskSingle, NI_Sve_CreateFalseMaskUInt16, - NI_Sve_CreateFalseMaskUInt32, NI_Sve_CreateFalseMaskUInt64)); - - if (OperIsHWIntrinsic()) - { - NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); - if (id == NI_Sve_ConvertMaskToVector) - { - GenTree* op1 = AsHWIntrinsic()->Op(1); - assert(op1->OperIsHWIntrinsic()); - id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); - } - return ((id == NI_Sve_CreateFalseMaskAll) || - ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64))); - } - -#endif - return false; -} - //------------------------------------------------------------------- // GetIntegralVectorConstElement: Gets the value of a given element in an integral vector constant // diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 0939a47eb5dd45..0f56413003ef56 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2913,14 +2913,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, default: // Invalid enum, so generate the create true mask node. - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); break; } } else { // Do not know the pattern, so generate the create true mask node. - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); } break; } diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 3d5f51632223a1..e553cea06f8d60 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -512,7 +512,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) // destination using /Z. 
assert((targetReg != embMaskOp2Reg) || (embMaskOp1Reg == embMaskOp2Reg)); - assert(intrin.op3->isContained() || !intrin.op1->IsMaskAllBitsSet()); + assert(intrin.op3->isContained() || !intrin.op1->IsTrueMask(node)); GetEmitter()->emitInsSve_R_R_R(INS_sve_movprfx, emitSize, targetReg, maskReg, embMaskOp1Reg, opt); } else @@ -610,7 +610,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { assert(intrin.op3->IsVectorZero()); - if (intrin.op1->isContained() || intrin.op1->IsMaskAllBitsSet()) + if (intrin.op1->isContained() || intrin.op1->IsTrueMask(node)) { // We already skip importing ConditionalSelect if op1 == trueAll, however // if we still see it here, it is because we wrapped the predicated instruction diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 6bfe619a25fe7b..0598936681dc4d 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -4023,7 +4023,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) } // Handle op3 - if (op3->IsVectorZero() && op1->IsMaskAllBitsSet() && op2->IsEmbMaskOp()) + if (op3->IsVectorZero() && op1->IsTrueMask(node) && op2->IsEmbMaskOp()) { // When we are merging with zero, we can specialize // and avoid instantiating the vector constant. @@ -4148,7 +4148,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this // optimisation when the nestedOp is a reduce operation. - if (nestedOp1->IsMaskAllBitsSet() && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) && + if (nestedOp1->IsTrueMask(cndSelNode) && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) && (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero())) { GenTree* nestedOp2 = nestedCndSel->Op(2); @@ -4177,7 +4177,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) } } } - else if (op1->IsMaskAllBitsSet()) + else if (op1->IsTrueMask(cndSelNode)) { // Any case where op2 is not an embedded HWIntrinsic if (!op2->OperIsHWIntrinsic() || diff --git a/src/tests/JIT/opt/SVE/ConstantMasks.cs b/src/tests/JIT/opt/SVE/ConstantMasks.cs index e1d78e2a250930..bdf48a09c9ae92 100644 --- a/src/tests/JIT/opt/SVE/ConstantMasks.cs +++ b/src/tests/JIT/opt/SVE/ConstantMasks.cs @@ -85,6 +85,7 @@ static void CndSelectEmbeddedZero(Vector op1, Vector op2) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedTrueMask(Vector op1, Vector op2) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movprfx {{z[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.AbsoluteDifference(op1, op2), op1); Consume(result); @@ -93,6 +94,7 @@ static void CndSelectEmbeddedTrueMask(Vector op1, Vector op2) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedAllBits(Vector op1, Vector op2) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movprfx {{z[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AbsoluteDifference(op1, op2), op1); Consume(result); From 66322672289225bf36afbd7a75793337aac752d8 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 3 Jun 2025 15:42:47 +0100 Subject: [PATCH 26/62] fix tests Change-Id: 
I3d74a7292e2c880fcaba215dc1fb58369e4ad141 --- src/tests/JIT/opt/SVE/ConstantMasks.cs | 19 +++++++++++++++++-- .../JIT/opt/SVE/ConstantMasksOp2Fixed.cs | 12 ++++-------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/tests/JIT/opt/SVE/ConstantMasks.cs b/src/tests/JIT/opt/SVE/ConstantMasks.cs index bdf48a09c9ae92..4d2e866fde2b7a 100644 --- a/src/tests/JIT/opt/SVE/ConstantMasks.cs +++ b/src/tests/JIT/opt/SVE/ConstantMasks.cs @@ -60,6 +60,7 @@ public static void TestEntryPoint() [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbedded(Vector mask, Vector op1, Vector op2) { //ARM64-FULL-LINE: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(mask, Sve.AbsoluteDifference(op1, op2), op1); Consume(result); } @@ -69,6 +70,7 @@ static void CndSelectEmbeddedFalseMask(Vector op1, Vector op2) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.AbsoluteDifference(op1, op2), op1); Consume(result); } @@ -78,6 +80,7 @@ static void CndSelectEmbeddedZero(Vector op1, Vector op2) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AbsoluteDifference(op1, op2), op1); Consume(result); } @@ -87,6 +90,7 @@ static void CndSelectEmbeddedTrueMask(Vector op1, Vector op2) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: movprfx {{z[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.AbsoluteDifference(op1, op2), op1); Consume(result); } @@ -96,6 +100,7 @@ static void CndSelectEmbeddedAllBits(Vector op1, Vector op2) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: movprfx {{z[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AbsoluteDifference(op1, op2), op1); Consume(result); } @@ -106,6 +111,7 @@ static void CndSelectEmbeddedAllBits(Vector op1, Vector op2) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbedded(Vector mask, Vector op1, Vector op2) { //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(mask, Sve.Add(op1, op2), op1); Consume(result); } @@ -115,6 +121,7 @@ static void CndSelectOptionalEmbeddedFalseMask(Vector op1, Vector op2) //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), op1); Consume(result); } @@ -124,6 +131,7 @@ static void CndSelectOptionalEmbeddedZero(Vector op1, Vector op2) { 
//ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.Add(op1, op2), op1); Consume(result); } @@ -131,6 +139,7 @@ static void CndSelectOptionalEmbeddedZero(Vector op1, Vector op2) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedTrueMask(Vector op1, Vector op2) { //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.Add(op1, op2), op1); Consume(result); } @@ -138,6 +147,7 @@ static void CndSelectOptionalEmbeddedTrueMask(Vector op1, Vector op2) [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedAllBits(Vector op1, Vector op2) { //ARM64-FULL-LINE: add {{z[0-9]+}}.s, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.Add(op1, op2), op1); Consume(result); } @@ -148,6 +158,7 @@ static void CndSelectOptionalEmbeddedAllBits(Vector op1, Vector op2) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedOneOp(Vector mask, Vector op1) { //ARM64-FULL-LINE: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(mask, Sve.Abs(op1), op1); Consume(result); } @@ -157,6 +168,7 @@ static void CndSelectEmbeddedOneOpFalseMask(Vector op1) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Abs(op1), op1); Consume(result); } @@ -166,6 +178,7 @@ static void CndSelectEmbeddedOneOpZero(Vector op1) { //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.Abs(op1), op1); Consume(result); } @@ -174,6 +187,7 @@ static void CndSelectEmbeddedOneOpZero(Vector op1) { static void CndSelectEmbeddedOneOpTrueMask(Vector op1) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.Abs(op1), op1); Consume(result); } @@ -182,6 +196,7 @@ static void CndSelectEmbeddedOneOpTrueMask(Vector op1) { static void CndSelectEmbeddedOneOpAllBits(Vector op1) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.Abs(op1), op1); Consume(result); } @@ -225,7 +240,7 @@ static void CndSelectEmbeddedReductionZero(Vector op1, Vector opf) { static void CndSelectEmbeddedReductionTrueMask(Vector op1, Vector opf) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt64(), 
Sve.AddAcross(op1), opf); Consume(result); } @@ -234,7 +249,7 @@ static void CndSelectEmbeddedReductionTrueMask(Vector op1, Vector opf static void CndSelectEmbeddedReductionAllBits(Vector op1, Vector opf) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AddAcross(op1), opf); Consume(result); } diff --git a/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs index c2b16ac691930e..ecb54cdf4358d2 100644 --- a/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs +++ b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs @@ -316,8 +316,7 @@ static void CndSelectEmbeddedReductionZeroZ(Vector op1) { static void CndSelectEmbeddedReductionTrueMaskF(Vector op1) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt64(), Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); } @@ -326,8 +325,7 @@ static void CndSelectEmbeddedReductionTrueMaskF(Vector op1) { static void CndSelectEmbeddedReductionTrueMaskZ(Vector op1) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Sve.CreateTrueMaskInt64(), Sve.AddAcross(op1), Vector.Zero); Consume(result); } @@ -336,8 +334,7 @@ static void CndSelectEmbeddedReductionTrueMaskZ(Vector op1) { static void CndSelectEmbeddedReductionAllBitsF(Vector op1) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); } @@ -346,8 +343,7 @@ static void CndSelectEmbeddedReductionAllBitsF(Vector op1) { static void CndSelectEmbeddedReductionAllBitsZ(Vector op1) { //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE-NEXT: movz {{.*}} Vector result = Sve.ConditionalSelect(Vector.AllBitsSet, Sve.AddAcross(op1), Vector.Zero); Consume(result); } From 4568afd8319527e24461be050166e7b32280a098 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Tue, 3 Jun 2025 15:49:48 +0100 Subject: [PATCH 27/62] fix formatting --- src/coreclr/jit/codegenarm64.cpp | 4 +-- src/coreclr/jit/gentree.cpp | 19 ++++++------ src/coreclr/jit/gentree.h | 4 +-- src/coreclr/jit/hwintrinsicarm64.cpp | 3 +- src/coreclr/jit/simd.h | 46 +++++++++++++--------------- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 2f51d108c7aa82..39a736d47ac26c 
100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2342,7 +2342,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre case GT_CNS_MSK: { GenTreeMskCon* mask = tree->AsMskCon(); - emitter* emit = GetEmitter(); + emitter* emit = GetEmitter(); // Try every type until a match is found @@ -2352,7 +2352,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre break; } - insOpts opt = INS_OPTS_SCALABLE_B; + insOpts opt = INS_OPTS_SCALABLE_B; SveMaskPattern pat = EvaluateSimdMaskPattern(TYP_BYTE, mask->gtSimdMaskVal); if (pat == SveMaskPatternNone) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index b0699047fb5ac8..c1b7eb5cd5cc6e 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -32118,7 +32118,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } #if defined(FEATURE_MASKED_HW_INTRINSICS) - //Fold ConvertMaskToVector(ConvertVectorToMask(vec)) to vec + // Fold ConvertMaskToVector(ConvertVectorToMask(vec)) to vec if (tree->OperIsConvertMaskToVector()) { GenTree* op = op1; @@ -32151,7 +32151,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } } - //Fold ConvertVectorToMask(ConvertMaskToVector(mask)) to mask + // Fold ConvertVectorToMask(ConvertMaskToVector(mask)) to mask if (tree->OperIsConvertVectorToMask()) { GenTree* op = op1; @@ -33311,7 +33311,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } #endif // FEATURE_HW_INTRINSICS - GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree, GenTreeVecCon* vecCon) { assert(tree->OperIsConvertVectorToMask()); @@ -33397,20 +33396,24 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const } // Only a valid true mask if the parent has the same base type - return genTypeSize(ParentSimdBaseType) == genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType())); + return genTypeSize(ParentSimdBaseType) == + genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType())); } else if (IsCnsMsk()) { switch (parent->gtType) { case TYP_SIMD8: - return SveMaskPatternAll == EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal); + return SveMaskPatternAll == + EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal); case TYP_SIMD12: - return SveMaskPatternAll == EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal); + return SveMaskPatternAll == + EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal); case TYP_SIMD16: - return SveMaskPatternAll == EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal); + return SveMaskPatternAll == + EvaluateSimdMaskPattern(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal); default: unreached(); @@ -33421,7 +33424,6 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const return false; } - bool GenTree::IsMaskZero() const { #ifdef TARGET_ARM64 @@ -33448,7 +33450,6 @@ bool GenTree::IsMaskZero() const return false; } - //------------------------------------------------------------------------ // gtCanSkipCovariantStoreCheck: see if storing a ref type value to an array // can skip the array store covariance check. 
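
The motivation for passing the parent node into IsTrueMask can be seen in
isolation. Below is a minimal standalone sketch, not JIT code; the helper
IsAllTrueForLaneSize and the fixed 128-bit vector size are illustrative
assumptions. The same predicate bits can be an all-true mask at one element
size and a partial mask at another:

    // Standalone sketch: an SVE-style predicate keeps one bit per element,
    // so "all true" is only meaningful relative to an element size.
    #include <cstdint>
    #include <cstdio>

    static bool IsAllTrueForLaneSize(uint64_t mask, uint32_t laneSize)
    {
        const uint32_t vecBytes = 16; // assume a 128-bit vector

        for (uint32_t i = 0; i < vecBytes / laneSize; i++)
        {
            if (((mask >> (i * laneSize)) & 1) == 0)
            {
                return false; // element i's predicate bit is clear
            }
        }
        return true;
    }

    int main()
    {
        const uint64_t ptrueS = 0x1111; // one bit per 4-byte element

        printf("int lanes:  %d\n", IsAllTrueForLaneSize(ptrueS, 4)); // prints 1
        printf("byte lanes: %d\n", IsAllTrueForLaneSize(ptrueS, 1)); // prints 0
        return 0;
    }

This is the same element-size relativity that EvaluateSimdMaskPattern relies
on when classifying a constant mask against the parent's base type.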
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index e92a0d48045871..d6fcedf27baa3c 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1818,8 +1818,8 @@ struct GenTree inline bool IsVectorCreate() const; inline bool IsVectorAllBitsSet() const; inline bool IsVectorBroadcast(var_types simdBaseType) const; - bool IsTrueMask(GenTreeHWIntrinsic* parent) const; - bool IsMaskZero() const; + bool IsTrueMask(GenTreeHWIntrinsic* parent) const; + bool IsMaskZero() const; inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType); diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 0f56413003ef56..fe669efa6676ec 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2913,7 +2913,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, default: // Invalid enum, so generate the create true mask node. - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_Sve_CreateTrueMaskAll, simdBaseJitType, + simdSize); break; } } diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 4a2a97cfa86e9c..706fe4dba84d52 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1674,32 +1674,30 @@ void EvaluateSimdCvtVectorToMask(var_types baseType, simdmask_t* result, TSimd a } } - #if defined(TARGET_ARM64) enum SveMaskPattern { - SveMaskPatternLargestPowerOf2 = 0, // The largest power of 2. - SveMaskPatternVectorCount1 = 1, // Exactly 1 element. - SveMaskPatternVectorCount2 = 2, // Exactly 2 elements. - SveMaskPatternVectorCount3 = 3, // Exactly 3 elements. - SveMaskPatternVectorCount4 = 4, // Exactly 4 elements. - SveMaskPatternVectorCount5 = 5, // Exactly 5 elements. - SveMaskPatternVectorCount6 = 6, // Exactly 6 elements. - SveMaskPatternVectorCount7 = 7, // Exactly 7 elements. - SveMaskPatternVectorCount8 = 8, // Exactly 8 elements. - SveMaskPatternVectorCount16 = 9, // Exactly 16 elements. - SveMaskPatternVectorCount32 = 10, // Exactly 32 elements. - SveMaskPatternVectorCount64 = 11, // Exactly 64 elements. - SveMaskPatternVectorCount128 = 12, // Exactly 128 elements. - SveMaskPatternVectorCount256 = 13, // Exactly 256 elements. - SveMaskPatternLargestMultipleOf4 = 29, // The largest multiple of 4. - SveMaskPatternLargestMultipleOf3 = 30, // The largest multiple of 3. - SveMaskPatternAll = 31, // All available (implicitly a multiple of two). - SveMaskPatternNone = 14 // Invalid + SveMaskPatternLargestPowerOf2 = 0, // The largest power of 2. + SveMaskPatternVectorCount1 = 1, // Exactly 1 element. + SveMaskPatternVectorCount2 = 2, // Exactly 2 elements. + SveMaskPatternVectorCount3 = 3, // Exactly 3 elements. + SveMaskPatternVectorCount4 = 4, // Exactly 4 elements. + SveMaskPatternVectorCount5 = 5, // Exactly 5 elements. + SveMaskPatternVectorCount6 = 6, // Exactly 6 elements. + SveMaskPatternVectorCount7 = 7, // Exactly 7 elements. + SveMaskPatternVectorCount8 = 8, // Exactly 8 elements. + SveMaskPatternVectorCount16 = 9, // Exactly 16 elements. + SveMaskPatternVectorCount32 = 10, // Exactly 32 elements. + SveMaskPatternVectorCount64 = 11, // Exactly 64 elements. + SveMaskPatternVectorCount128 = 12, // Exactly 128 elements. + SveMaskPatternVectorCount256 = 13, // Exactly 256 elements. + SveMaskPatternLargestMultipleOf4 = 29, // The largest multiple of 4. + SveMaskPatternLargestMultipleOf3 = 30, // The largest multiple of 3. 
+ SveMaskPatternAll = 31, // All available (implicitly a multiple of two). + SveMaskPatternNone = 14 // Invalid }; - template SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0) { @@ -1720,7 +1718,7 @@ SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0) // on the corresponding mask bit bool isSet = ((mask >> (i * sizeof(TBase))) & 1) != 0; - if(!isSet) + if (!isSet) { finalOne = i; break; @@ -1736,7 +1734,7 @@ SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0) // on the corresponding mask bit bool isSet = ((mask >> (i * sizeof(TBase))) & 1) != 0; - if(isSet) + if (isSet) { // Invalid sequence return SveMaskPatternNone; @@ -1753,8 +1751,8 @@ SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0) } else { - //TODO: Add other patterns as required. These probably won't be seen until we get - // to wider vector lengths. + // TODO: Add other patterns as required. These probably won't be seen until we get + // to wider vector lengths. return SveMaskPatternNone; } } From 0e90437e1bf06c6b3d7d8e0b1071037f91e311d1 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 09:55:14 +0100 Subject: [PATCH 28/62] Add EvaluateSimdPatternToMask --- src/coreclr/jit/gentree.cpp | 17 ++++---- src/coreclr/jit/hwintrinsic.cpp | 8 +--- src/coreclr/jit/hwintrinsicarm64.cpp | 47 ++++++---------------- src/coreclr/jit/simd.h | 60 ++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 48 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index c1b7eb5cd5cc6e..c708b8b07a977f 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33379,6 +33379,12 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const var_types ParentSimdBaseType = JitType2PreciseVarType(parent->GetSimdBaseJitType()); #ifdef TARGET_ARM64 + static_assert_no_msg(AreContiguous(NI_Sve_CreateTrueMaskByte, NI_Sve_CreateTrueMaskDouble, + NI_Sve_CreateTrueMaskInt16, NI_Sve_CreateTrueMaskInt32, + NI_Sve_CreateTrueMaskInt64, NI_Sve_CreateTrueMaskSByte, + NI_Sve_CreateTrueMaskSingle, NI_Sve_CreateTrueMaskUInt16, + NI_Sve_CreateTrueMaskUInt32, NI_Sve_CreateTrueMaskUInt64)); + if (OperIsHWIntrinsic()) { NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); @@ -33389,15 +33395,12 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); } - // Only TrueMaskAlls will be imported - if (id != NI_Sve_CreateTrueMaskAll) - { - return false; - } + bool validTrueMask = ((id == NI_Sve_CreateTrueMaskAll) || + ((id >= NI_Sve_CreateTrueMaskByte) && (id <= NI_Sve_CreateTrueMaskUInt64))); // Only a valid true mask if the parent has the same base type - return genTypeSize(ParentSimdBaseType) == - genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType())); + return (validTrueMask && (genTypeSize(ParentSimdBaseType) == + genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType())))); } else if (IsCnsMsk()) { diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index ff1177922f5cce..4f7f01a49b5ed5 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -2527,13 +2527,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, if (nodeRetType == TYP_MASK) { // HWInstrinsic returns a mask, but all returns must be vectors, so convert mask to vector. 
- - GenTreeHWIntrinsic* op = retNode->AsHWIntrinsic(); - - CorInfoType simdBaseJitType = op->GetSimdBaseJitType(); - unsigned simdSize = op->GetSimdSize(); - - retNode = gtNewSimdCvtMaskToVectorNode(retType, op, simdBaseJitType, simdSize); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); } #endif // FEATURE_MASKED_HW_INTRINSICS && TARGET_ARM64 diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index fe669efa6676ec..7eb0619c1bee2f 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2880,49 +2880,28 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Sve_CreateTrueMaskUInt64: { assert(sig->numArgs == 1); - op1 = impPopStack().val; // TODO: For AOT, always do the gtNewSimdHWIntrinsicNode as we don't know the vector size. - // Where possible, import a constant vector to allow for optimisations. + // Where possible, import a constant mask to allow for optimisations. if (op1->IsIntegralConst()) { + simdmask_t maskVal = {}; + int64_t pattern = op1->AsIntConCommon()->IntegralValue(); - switch (pattern) - { - case 0: // POW2 - case 1: // VL1 - case 2: // VL2 - case 3: // VL3 - case 4: // VL4 - case 5: // VL5 - case 6: // VL6 - case 7: // VL7 - case 8: // VL8 - case 9: // VL16 - case 10: // VL32 - case 11: // VL64 - case 12: // VL128 - case 13: // VL256 - case 29: // MUL4 - case 30: // MUL3 - case 31: // ALL - retNode = gtNewSimdCnsVecTrueMaskPattern(retType, simdSize, simdBaseType, pattern); - break; - default: - // Invalid enum, so generate the create true mask node. - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_Sve_CreateTrueMaskAll, simdBaseJitType, - simdSize); - break; + if (EvaluateSimdPatternToMask(&maskVal, simdSize, genTypeSize(simdBaseType), (SveMaskPattern)pattern)) + { + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + mskCon->gtSimdMaskVal = maskVal; + retNode = mskCon; + break; } } - else - { - // Do not know the pattern, so generate the create true mask node. - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); - } + + // Was not able to generate a pattern, instead import a truemaskall + retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, intrinsic, simdBaseJitType, simdSize); break; } @@ -3464,7 +3443,7 @@ GenTree* Compiler::gtNewSimdCnsVecTrueMaskPattern(var_types retType, { int64_t lanes = simdSize / genTypeSize(simdBaseType); int64_t laneBits = genTypeSize(simdBaseType) * 8; - int64_t laneVal = 1; //(laneBits > 32) ? 
UINT64_MAX : (((int64_t)1 << laneBits) - 1); + int64_t laneVal = 1; // Ensure the base type is integral if (simdBaseType == TYP_DOUBLE) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 706fe4dba84d52..25ecf0439390d9 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1698,6 +1698,66 @@ enum SveMaskPattern SveMaskPatternNone = 14 // Invalid }; +inline bool EvaluateSimdPatternToMask(simdmask_t* result, + int simdSize, + int simdBaseTypeSize, + SveMaskPattern pattern) +{ + uint32_t count = simdSize / simdBaseTypeSize; + uint32_t finalOne = count + 1; + uint64_t mask = 0; + + switch (pattern) + { + case SveMaskPatternLargestPowerOf2: + case SveMaskPatternAll: + finalOne = count; + break; + + case SveMaskPatternVectorCount1: + case SveMaskPatternVectorCount2: + case SveMaskPatternVectorCount3: + case SveMaskPatternVectorCount4: + case SveMaskPatternVectorCount5: + case SveMaskPatternVectorCount6: + case SveMaskPatternVectorCount7: + case SveMaskPatternVectorCount8: + finalOne = pattern - SveMaskPatternVectorCount1 + 1; + break; + + case SveMaskPatternVectorCount16: + case SveMaskPatternVectorCount32: + case SveMaskPatternVectorCount64: + case SveMaskPatternVectorCount128: + case SveMaskPatternVectorCount256: + finalOne = std::min(uint32_t(16 << (pattern - SveMaskPatternVectorCount16)), count); + break; + + case SveMaskPatternLargestMultipleOf4: + finalOne = (count - (count % 4)); + break; + + case SveMaskPatternLargestMultipleOf3: + finalOne = (count - (count % 3)); + break; + + default: + return false; + } + assert(finalOne <= count); + assert(finalOne > 0); + + // Write finalOne number of bits + for (uint32_t i = 0; i < finalOne; i++) + { + mask |= static_cast(1) << (i * simdBaseTypeSize); + } + + memcpy(&result->u8[0], &mask, sizeof(uint64_t)); + return true; +} + + template SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0) { From 0258433ebac61617487edda524d3f776ac1c9f07 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 11:39:32 +0100 Subject: [PATCH 29/62] import vectors not masks --- src/coreclr/jit/hwintrinsicarm64.cpp | 9 ++-- src/coreclr/jit/simd.h | 66 +++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 7eb0619c1bee2f..f8abd845070bb4 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -2887,15 +2887,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // Where possible, import a constant mask to allow for optimisations. 
if (op1->IsIntegralConst())
         {
-            simdmask_t maskVal = {};
-
             int64_t pattern = op1->AsIntConCommon()->IntegralValue();
+            simd_t  simdVal;

-            if (EvaluateSimdPatternToMask(&maskVal, simdSize, genTypeSize(simdBaseType), (SveMaskPattern)pattern))
+            if (EvaluateSimdPatternToVector(simdBaseType, &simdVal, (SveMaskPattern)pattern))
             {
-                GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK);
-                mskCon->gtSimdMaskVal = maskVal;
-                retNode               = mskCon;
+                retNode = gtNewVconNode(retType, &simdVal);
                 break;
             }
         }
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
index 25ecf0439390d9..d58af573fbe827 100644
--- a/src/coreclr/jit/simd.h
+++ b/src/coreclr/jit/simd.h
@@ -1698,14 +1698,11 @@ enum SveMaskPattern
     SveMaskPatternNone = 14 // Invalid
 };

-inline bool EvaluateSimdPatternToMask(simdmask_t* result,
-                                      int simdSize,
-                                      int simdBaseTypeSize,
-                                      SveMaskPattern pattern)
+template <typename TSimd, typename TBase>
+bool EvaluateSimdPatternToVector(simd_t* result, SveMaskPattern pattern)
 {
-    uint32_t count    = simdSize / simdBaseTypeSize;
+    uint32_t count    = sizeof(TSimd) / sizeof(TBase);
     uint32_t finalOne = count + 1;
-    uint64_t mask     = 0;

     switch (pattern)
     {
@@ -1722,7 +1719,7 @@ inline bool EvaluateSimdPatternToMask(simdmask_t* result,
         case SveMaskPatternVectorCount6:
         case SveMaskPatternVectorCount7:
         case SveMaskPatternVectorCount8:
-            finalOne = pattern - SveMaskPatternVectorCount1 + 1;
+            finalOne = std::min(uint32_t(pattern - SveMaskPatternVectorCount1 + 1), count);
             break;

         case SveMaskPatternVectorCount16:
@@ -1747,16 +1744,63 @@ inline bool EvaluateSimdPatternToMask(simdmask_t* result,
     assert(finalOne <= count);
     assert(finalOne > 0);

-    // Write finalOne number of bits
-    for (uint32_t i = 0; i < finalOne; i++)
+    // Write finalOne number of entries
+    for (uint32_t i = 0; i < count; i++)
     {
-        mask |= static_cast<uint64_t>(1) << (i * simdBaseTypeSize);
+        TBase output;
+
+        if (i < finalOne)
+        {
+            memset(&output, 0xFF, sizeof(TBase));
+        }
+        else
+        {
+            memset(&output, 0x00, sizeof(TBase));
+        }
+
+        memcpy(&result->u8[i * sizeof(TBase)], &output, sizeof(TBase));
     }

-    memcpy(&result->u8[0], &mask, sizeof(uint64_t));
     return true;
 }

+template <typename TSimd>
+bool EvaluateSimdPatternToVector(var_types baseType, TSimd* result, SveMaskPattern pattern)
+{
+    switch (baseType)
+    {
+        case TYP_FLOAT:
+        case TYP_INT:
+        case TYP_UINT:
+        {
+            return EvaluateSimdPatternToVector<TSimd, uint32_t>(result, pattern);
+        }
+
+        case TYP_DOUBLE:
+        case TYP_LONG:
+        case TYP_ULONG:
+        {
+            return EvaluateSimdPatternToVector<TSimd, uint64_t>(result, pattern);
+        }
+
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            return EvaluateSimdPatternToVector<TSimd, uint8_t>(result, pattern);
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            return EvaluateSimdPatternToVector<TSimd, uint16_t>(result, pattern);
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}

 template <typename TSimd, typename TBase>
 SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0)

From 66daf62f66fd67d936072f5df45985bf7e33f2a7 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Wed, 4 Jun 2025 11:40:34 +0100
Subject: [PATCH 30/62] rename to EvaluateSimdMaskToPattern

---
 src/coreclr/jit/codegenarm64.cpp |  8 ++++----
 src/coreclr/jit/gentree.cpp      |  6 +++---
 src/coreclr/jit/simd.h           | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 39a736d47ac26c..56f764156e504c 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -2353,24 +2353,24 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
     }

     insOpts        opt = INS_OPTS_SCALABLE_B;
-    SveMaskPattern pat = EvaluateSimdMaskPattern<simd16_t>(TYP_BYTE, mask->gtSimdMaskVal);
+   
SveMaskPattern pat = EvaluateSimdMaskToPattern<simd16_t>(TYP_BYTE, mask->gtSimdMaskVal);

     if (pat == SveMaskPatternNone)
     {
         opt = INS_OPTS_SCALABLE_H;
-        pat = EvaluateSimdMaskPattern<simd16_t>(TYP_SHORT, mask->gtSimdMaskVal);
+        pat = EvaluateSimdMaskToPattern<simd16_t>(TYP_SHORT, mask->gtSimdMaskVal);
     }

     if (pat == SveMaskPatternNone)
     {
         opt = INS_OPTS_SCALABLE_S;
-        pat = EvaluateSimdMaskPattern<simd16_t>(TYP_INT, mask->gtSimdMaskVal);
+        pat = EvaluateSimdMaskToPattern<simd16_t>(TYP_INT, mask->gtSimdMaskVal);
     }

     if (pat == SveMaskPatternNone)
     {
         opt = INS_OPTS_SCALABLE_D;
-        pat = EvaluateSimdMaskPattern<simd16_t>(TYP_LONG, mask->gtSimdMaskVal);
+        pat = EvaluateSimdMaskToPattern<simd16_t>(TYP_LONG, mask->gtSimdMaskVal);
     }

     emit->emitIns_R_PATTERN(INS_sve_ptrue, EA_SCALABLE, targetReg, opt, (insSvePattern)pat);
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index c708b8b07a977f..fe7e7ce0c398ad 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -33408,15 +33408,15 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const
         {
             case TYP_SIMD8:
                 return SveMaskPatternAll ==
-                       EvaluateSimdMaskPattern<simd8_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+                       EvaluateSimdMaskToPattern<simd8_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);

             case TYP_SIMD12:
                 return SveMaskPatternAll ==
-                       EvaluateSimdMaskPattern<simd12_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+                       EvaluateSimdMaskToPattern<simd12_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);

             case TYP_SIMD16:
                 return SveMaskPatternAll ==
-                       EvaluateSimdMaskPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+                       EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);

             default:
                 unreached();
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
index d58af573fbe827..e6970c354cbef5 100644
--- a/src/coreclr/jit/simd.h
+++ b/src/coreclr/jit/simd.h
@@ -1803,7 +1803,7 @@ bool EvaluateSimdPatternToVector(var_types baseType, TSimd* result, SveMaskPatte
 }

 template <typename TSimd, typename TBase>
-SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0)
+SveMaskPattern EvaluateSimdMaskToPattern(simdmask_t arg0)
 {
     uint32_t count = sizeof(TSimd) / sizeof(TBase);

@@ -1862,7 +1862,7 @@ SveMaskPattern EvaluateSimdMaskPattern(simdmask_t arg0)
 }

 template <typename TSimd>
-SveMaskPattern EvaluateSimdMaskPattern(var_types baseType, simdmask_t arg0)
+SveMaskPattern EvaluateSimdMaskToPattern(var_types baseType, simdmask_t arg0)
 {
     switch (baseType)
     {
@@ -1870,26 +1870,26 @@ SveMaskPattern EvaluateSimdMaskPattern(var_types baseType, simdmask_t arg0)
         case TYP_INT:
         case TYP_UINT:
         {
-            return EvaluateSimdMaskPattern<TSimd, uint32_t>(arg0);
+            return EvaluateSimdMaskToPattern<TSimd, uint32_t>(arg0);
         }

         case TYP_DOUBLE:
         case TYP_LONG:
         case TYP_ULONG:
         {
-            return EvaluateSimdMaskPattern<TSimd, uint64_t>(arg0);
+            return EvaluateSimdMaskToPattern<TSimd, uint64_t>(arg0);
         }

         case TYP_BYTE:
         case TYP_UBYTE:
         {
-            return EvaluateSimdMaskPattern<TSimd, uint8_t>(arg0);
+            return EvaluateSimdMaskToPattern<TSimd, uint8_t>(arg0);
         }

         case TYP_SHORT:
         case TYP_USHORT:
         {
-            return EvaluateSimdMaskPattern<TSimd, uint16_t>(arg0);
+            return EvaluateSimdMaskToPattern<TSimd, uint16_t>(arg0);
         }

         default:

From 2a28c558efa5d3d11a3e9ca19a197b853506d6f3 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Wed, 4 Jun 2025 11:42:38 +0100
Subject: [PATCH 31/62] Add unreached

---
 src/coreclr/jit/codegenarm64.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 56f764156e504c..f5e48ff2a93221 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -2373,6 +2373,12 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
         pat = EvaluateSimdMaskToPattern<simd16_t>(TYP_LONG, mask->gtSimdMaskVal);
     }

+    // Should
only ever create constant masks for valid patterns. + if (pat == SveMaskPatternNone) + { + unreached(); + } + emit->emitIns_R_PATTERN(INS_sve_ptrue, EA_SCALABLE, targetReg, opt, (insSvePattern)pat); break; } From 521be0bc4c0250b454ac55748b3984968bc1cbda Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 13:58:53 +0100 Subject: [PATCH 32/62] formatting --- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/simd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index fe7e7ce0c398ad..6861465d56efb2 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33400,7 +33400,7 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const // Only a valid true mask if the parent has the same base type return (validTrueMask && (genTypeSize(ParentSimdBaseType) == - genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType())))); + genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType())))); } else if (IsCnsMsk()) { diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index e6970c354cbef5..4a680c0ae4a707 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1701,7 +1701,7 @@ enum SveMaskPattern template bool EvaluateSimdPatternToVector(simd_t* result, SveMaskPattern pattern) { - uint32_t count = sizeof(TSimd) / sizeof(TBase); + uint32_t count = sizeof(TSimd) / sizeof(TBase); uint32_t finalOne = count + 1; switch (pattern) From 7f52e5589885377786cdd8ad1679e5238bc89217 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 14:02:33 +0100 Subject: [PATCH 33/62] fix IsTrueMask --- src/coreclr/jit/gentree.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 6861465d56efb2..ff64f508021356 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33379,12 +33379,6 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const var_types ParentSimdBaseType = JitType2PreciseVarType(parent->GetSimdBaseJitType()); #ifdef TARGET_ARM64 - static_assert_no_msg(AreContiguous(NI_Sve_CreateTrueMaskByte, NI_Sve_CreateTrueMaskDouble, - NI_Sve_CreateTrueMaskInt16, NI_Sve_CreateTrueMaskInt32, - NI_Sve_CreateTrueMaskInt64, NI_Sve_CreateTrueMaskSByte, - NI_Sve_CreateTrueMaskSingle, NI_Sve_CreateTrueMaskUInt16, - NI_Sve_CreateTrueMaskUInt32, NI_Sve_CreateTrueMaskUInt64)); - if (OperIsHWIntrinsic()) { NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); @@ -33395,12 +33389,13 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); } - bool validTrueMask = ((id == NI_Sve_CreateTrueMaskAll) || - ((id >= NI_Sve_CreateTrueMaskByte) && (id <= NI_Sve_CreateTrueMaskUInt64))); + if (id != NI_Sve_CreateTrueMaskAll) + { + return false; + } // Only a valid true mask if the parent has the same base type - return (validTrueMask && (genTypeSize(ParentSimdBaseType) == - genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType())))); + return (genTypeSize(ParentSimdBaseType) == genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType()))); } else if (IsCnsMsk()) { From c74f35d1f8ff1ee2009abfafedc4763ec86004b6 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 14:08:56 +0100 Subject: [PATCH 34/62] remove emb op fix --- src/coreclr/jit/gentree.cpp | 10 +--------- src/coreclr/jit/gentree.h | 12 ------------ 2 files changed, 1 insertion(+), 21 deletions(-) 
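Note on the base-type check retained in IsTrueMask above: an SVE predicate
dedicates one bit per byte of vector, and for an element size of N bytes only
bit (lane * N) governs the lane, so a ptrue.s predicate is all-true at 4-byte
granularity but not at byte granularity. A minimal standalone sketch of that
rule, not JIT code; the 128-bit vector length and the helper name are
illustrative assumptions:

    #include <cstdint>
    #include <cstdio>

    // True when every lane of width elemSize is active, assuming the SVE
    // convention that predicate bit (lane * elemSize) governs lane number lane.
    static bool IsAllTrueForElemSize(uint16_t predBits, unsigned elemSize)
    {
        const unsigned vectorBytes = 16; // illustrative 128-bit vector
        for (unsigned lane = 0; lane < vectorBytes / elemSize; lane++)
        {
            if (((predBits >> (lane * elemSize)) & 1) == 0)
            {
                return false;
            }
        }
        return true;
    }

    int main()
    {
        uint16_t ptrueS = 0x1111; // what ptrue.s produces: one bit every 4 bytes
        printf("%d\n", IsAllTrueForElemSize(ptrueS, 4)); // 1: all-true for TYP_INT
        printf("%d\n", IsAllTrueForElemSize(ptrueS, 1)); // 0: not for TYP_BYTE
        return 0;
    }

This is why a CreateTrueMaskAll only counts as a true mask when genTypeSize of
the consumer's base type matches the producer's.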
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ff64f508021356..4f66386c2d8765 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -7218,7 +7218,7 @@ bool GenTree::OperMayThrow(Compiler* comp) return true; } -#if defined(TARGET_XARCH) +#ifdef TARGET_XARCH NamedIntrinsic intrinsicId = this->AsHWIntrinsic()->GetHWIntrinsicId(); if (intrinsicId == NI_Vector128_op_Division || intrinsicId == NI_Vector256_op_Division || intrinsicId == NI_Vector512_op_Division) @@ -7226,14 +7226,6 @@ bool GenTree::OperMayThrow(Compiler* comp) assert(varTypeIsInt(AsHWIntrinsic()->GetSimdBaseType())); return true; } -#elif defined(TARGET_ARM64) - NamedIntrinsic intrinsicId = this->AsHWIntrinsic()->GetHWIntrinsicId(); - if (intrinsicId == NI_Sve_ConditionalSelect) - { - // If op2 is embedded, then check if that will throw. - GenTree* op2 = this->AsHWIntrinsic()->Op(2); - return (op2->IsEmbMaskOp() && op2->OperMayThrow(comp)); - } #endif // TARGET_XARCH } #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index d6fcedf27baa3c..41f01f8b5729b0 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -2076,12 +2076,6 @@ struct GenTree assert(IsValue()); gtFlags &= ~GTF_CONTAINED; ClearRegOptional(); -#ifdef FEATURE_HW_INTRINSICS - if (OperIsHWIntrinsic()) - { - ClearEmbMaskOp(); - } -#endif } bool CanCSE() const @@ -2304,12 +2298,6 @@ struct GenTree gtFlags |= GTF_HW_EM_OP; } - void ClearEmbMaskOp() - { - assert(OperIsHWIntrinsic()); - gtFlags &= ~GTF_HW_EM_OP; - } - #endif // FEATURE_HW_INTRINSICS static bool HandleKindDataIsInvariant(GenTreeFlags flags); From ae216da6d59aae62fd2c25c40c59b46e5cd5e2fb Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 15:06:35 +0100 Subject: [PATCH 35/62] fix morphing errors --- src/coreclr/jit/gentree.cpp | 9 ++++++--- src/coreclr/jit/gentree.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 4f66386c2d8765..e1613a1d6f5301 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33249,7 +33249,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) return op2; } - if (op1->IsVectorZero()) + if (op1->IsVectorZero() || op1->IsFalseMask()) { return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT); } @@ -33414,7 +33414,7 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const return false; } -bool GenTree::IsMaskZero() const +bool GenTree::IsFalseMask() const { #ifdef TARGET_ARM64 static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble, @@ -33435,7 +33435,10 @@ bool GenTree::IsMaskZero() const return ((id == NI_Sve_CreateFalseMaskAll) || ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64))); } - + else if (IsCnsMsk()) + { + return AsMskCon()->IsZero(); + } #endif return false; } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 41f01f8b5729b0..f89ceac47b2ef1 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1819,7 +1819,7 @@ struct GenTree inline bool IsVectorAllBitsSet() const; inline bool IsVectorBroadcast(var_types simdBaseType) const; bool IsTrueMask(GenTreeHWIntrinsic* parent) const; - bool IsMaskZero() const; + bool IsFalseMask() const; inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType); From b2075cd2c40c1738017a2b3a0950e3481aa21a4f Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 
Jun 2025 15:14:59 +0100 Subject: [PATCH 36/62] Remove NI_Sve_CreateFalseMaskAll --- src/coreclr/jit/gentree.cpp | 9 +-------- src/coreclr/jit/hwintrinsicarm64.cpp | 2 ++ src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 4 ---- src/coreclr/jit/hwintrinsiclistarm64sve.h | 1 - 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index e1613a1d6f5301..bea4cd790dc956 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -33417,12 +33417,6 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const bool GenTree::IsFalseMask() const { #ifdef TARGET_ARM64 - static_assert_no_msg(AreContiguous(NI_Sve_CreateFalseMaskByte, NI_Sve_CreateFalseMaskDouble, - NI_Sve_CreateFalseMaskInt16, NI_Sve_CreateFalseMaskInt32, - NI_Sve_CreateFalseMaskInt64, NI_Sve_CreateFalseMaskSByte, - NI_Sve_CreateFalseMaskSingle, NI_Sve_CreateFalseMaskUInt16, - NI_Sve_CreateFalseMaskUInt32, NI_Sve_CreateFalseMaskUInt64)); - if (OperIsHWIntrinsic()) { NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); @@ -33432,8 +33426,7 @@ bool GenTree::IsFalseMask() const assert(op1->OperIsHWIntrinsic()); id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); } - return ((id == NI_Sve_CreateFalseMaskAll) || - ((id >= NI_Sve_CreateFalseMaskByte) && (id <= NI_Sve_CreateFalseMaskUInt64))); + return (id == NI_Sve_CreateFalseMaskByte); } else if (IsCnsMsk()) { diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index f8abd845070bb4..119cb5909002d0 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -3404,6 +3404,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize) { + // TODO: This should import constant vector all bits set return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); } @@ -3418,6 +3419,7 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne // GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize) { + // TODO: This should import constant vector 0 return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateFalseMaskByte, CORINFO_TYPE_UBYTE, simdSize); } diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index e553cea06f8d60..3a6ec340e29aed 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2033,10 +2033,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_Sve_CreateFalseMaskAll: - GetEmitter()->emitInsSve_R(ins, emitSize, targetReg, opt); - break; - case NI_Sve_CreateTrueMaskAll: // Must use the pattern variant, as the non-pattern varient is SVE2.1. 
GetEmitter()->emitIns_R_PATTERN(ins, emitSize, targetReg, opt, SVE_PATTERN_ALL); diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index d3527771f22dcc..4b2112976df00b 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -326,7 +326,6 @@ HARDWARE_INTRINSIC(Sve, ConditionalExtractAfterLastActiveElementScalar HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElementScalar, 0, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_HasRMWSemantics|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertMaskToVector, -1, 1, {INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov}, HW_Category_Helper, HW_Flag_Scalable) HARDWARE_INTRINSIC(Sve, ConvertVectorToMask, -1, 2, {INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_LowMaskedOperation) -HARDWARE_INTRINSIC(Sve, CreateFalseMaskAll, -1, 0, {INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse, INS_sve_pfalse}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, CreateTrueMaskAll, -1, 0, {INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) // Scalar variants of Saturating*By*BitElementCount. There is 8bit versions as the generic version is scalar only. HARDWARE_INTRINSIC(Sve, SaturatingDecrementBy16BitElementCountScalar, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sqdech, INS_sve_uqdech, INS_sve_sqdech, INS_sve_uqdech, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_HasRMWSemantics) From 52aaa721476c80fd5e993e1ed17815c157d97fad Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 17:38:49 +0100 Subject: [PATCH 37/62] rename TrueMaskAll to ConversionTrueMask and only use as such --- src/coreclr/jit/gentree.cpp | 37 +++++---------------- src/coreclr/jit/hwintrinsicarm64.cpp | 18 +++++++--- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 2 +- src/coreclr/jit/hwintrinsiclistarm64sve.h | 3 +- 4 files changed, 26 insertions(+), 34 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index bea4cd790dc956..3a22132e79d101 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -22054,8 +22054,8 @@ GenTree* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, #if defined(TARGET_XARCH) return gtNewSimdHWIntrinsicNode(TYP_MASK, op1, NI_EVEX_ConvertVectorToMask, simdBaseJitType, simdSize); #elif defined(TARGET_ARM64) - // We use cmpne which requires an embedded mask. - GenTree* trueMask = gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); + // ConvertVectorToMask uses cmpne which requires an embedded mask. 
+ GenTree* trueMask = gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_ConversionTrueMask, simdBaseJitType, simdSize); return gtNewSimdHWIntrinsicNode(TYP_MASK, trueMask, op1, NI_Sve_ConvertVectorToMask, simdBaseJitType, simdSize); #else #error Unsupported platform @@ -32152,11 +32152,9 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) #if defined(TARGET_XARCH) tryHandle = op->OperIsHWIntrinsic(); #elif defined(TARGET_ARM64) - if (op->OperIsHWIntrinsic(NI_Sve_CreateTrueMaskAll)) - { - op = op2; - tryHandle = op->OperIsHWIntrinsic(); - } + assert(op->OperIsHWIntrinsic(NI_Sve_ConversionTrueMask)); + op = op2; + tryHandle = op->OperIsHWIntrinsic(); #endif // TARGET_ARM64 if (tryHandle) @@ -33368,28 +33366,10 @@ GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree, // bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const { - var_types ParentSimdBaseType = JitType2PreciseVarType(parent->GetSimdBaseJitType()); - #ifdef TARGET_ARM64 - if (OperIsHWIntrinsic()) - { - NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId(); - if (id == NI_Sve_ConvertMaskToVector) - { - GenTree* op1 = AsHWIntrinsic()->Op(1); - assert(op1->OperIsHWIntrinsic()); - id = op1->AsHWIntrinsic()->GetHWIntrinsicId(); - } - - if (id != NI_Sve_CreateTrueMaskAll) - { - return false; - } + var_types ParentSimdBaseType = JitType2PreciseVarType(parent->GetSimdBaseJitType()); - // Only a valid true mask if the parent has the same base type - return (genTypeSize(ParentSimdBaseType) == genTypeSize(JitType2PreciseVarType(AsHWIntrinsic()->GetSimdBaseJitType()))); - } - else if (IsCnsMsk()) + if (IsCnsMsk()) { switch (parent->gtType) { @@ -33409,8 +33389,8 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const unreached(); } } - #endif + return false; } @@ -33433,6 +33413,7 @@ bool GenTree::IsFalseMask() const return AsMskCon()->IsZero(); } #endif + return false; } diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 119cb5909002d0..ea6dd58ceff7fb 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -3404,8 +3404,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize) { - // TODO: This should import constant vector all bits set - return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); + // Import as a constant vector all bits set + + var_types simdType = getSIMDTypeForSize(simdSize); + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + + simd_t simdVal; + bool found = EvaluateSimdPatternToVector(simdBaseType, &simdVal, SveMaskPatternAll); + assert(found); + + return gtNewVconNode(simdType, &simdVal); } //------------------------------------------------------------------------ @@ -3419,8 +3427,10 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne // GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize) { - // TODO: This should import constant vector 0 - return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateFalseMaskByte, CORINFO_TYPE_UBYTE, simdSize); + // Import as a constant vector 0 + GenTreeVecCon* vecCon = gtNewVconNode(getSIMDTypeForSize(simdSize)); + vecCon->gtSimdVal = simd_t::Zero(); + return vecCon; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 
3a6ec340e29aed..e3fb77459446b9 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2033,7 +2033,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_Sve_CreateTrueMaskAll: + case NI_Sve_ConversionTrueMask: // Must use the pattern variant, as the non-pattern varient is SVE2.1. GetEmitter()->emitIns_R_PATTERN(ins, emitSize, targetReg, opt, SVE_PATTERN_ALL); break; diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index 4b2112976df00b..51b4f642ac1e44 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -326,7 +326,8 @@ HARDWARE_INTRINSIC(Sve, ConditionalExtractAfterLastActiveElementScalar HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElementScalar, 0, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_HasRMWSemantics|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertMaskToVector, -1, 1, {INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov, INS_sve_mov}, HW_Category_Helper, HW_Flag_Scalable) HARDWARE_INTRINSIC(Sve, ConvertVectorToMask, -1, 2, {INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne, INS_sve_cmpne}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_LowMaskedOperation) -HARDWARE_INTRINSIC(Sve, CreateTrueMaskAll, -1, 0, {INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +// True mask only used inside a ConvertVectorToMask +HARDWARE_INTRINSIC(Sve, ConversionTrueMask, -1, 0, {INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue, INS_sve_ptrue}, HW_Category_Helper, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) // Scalar variants of Saturating*By*BitElementCount. There is 8bit versions as the generic version is scalar only. 
HARDWARE_INTRINSIC(Sve, SaturatingDecrementBy16BitElementCountScalar, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sqdech, INS_sve_uqdech, INS_sve_sqdech, INS_sve_uqdech, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, SaturatingDecrementBy32BitElementCountScalar, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sqdecw, INS_sve_uqdecw, INS_sve_sqdecw, INS_sve_uqdecw, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_HasRMWSemantics) From bd6da1992814c4627dd1480845728024a967cd08 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Wed, 4 Jun 2025 17:56:16 +0100 Subject: [PATCH 38/62] remove gtNewSimdCnsVecTrueMaskPattern --- src/coreclr/jit/compiler.h | 1 - src/coreclr/jit/hwintrinsicarm64.cpp | 83 ---------------------------- src/coreclr/jit/instr.h | 3 +- 3 files changed, 1 insertion(+), 86 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 6d1fa2de97222d..cd4dc044e47a7d 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3134,7 +3134,6 @@ class Compiler #if defined(TARGET_ARM64) GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize); GenTree* gtNewSimdFalseMaskByteNode(unsigned simdSize); - GenTree* gtNewSimdCnsVecTrueMaskPattern(var_types retType, int simdSize, var_types simdBaseType, int64_t pattern); #endif GenTree* gtNewSimdBinOpNode(genTreeOps op, diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index ea6dd58ceff7fb..6dba038c88bd62 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -3433,87 +3433,4 @@ GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize) return vecCon; } -//------------------------------------------------------------------------ -// gtNewSimdCnsVecTrueMaskPattern: Create a constant vector with a true mask bit pattern -// -// Arguments: -// retType -- return type of the intrinsic. 
-// simdSize -- the simd size of the nodes being created -// simdBaseJitType -- the base jit type of the nodes being created -// pattern -- The pattern to use as defined by the Arm PTRUE instruction -// -// Return Value: -// The node -// -GenTree* Compiler::gtNewSimdCnsVecTrueMaskPattern(var_types retType, - int simdSize, - var_types simdBaseType, - int64_t pattern) -{ - int64_t lanes = simdSize / genTypeSize(simdBaseType); - int64_t laneBits = genTypeSize(simdBaseType) * 8; - int64_t laneVal = 1; - - // Ensure the base type is integral - if (simdBaseType == TYP_DOUBLE) - { - simdBaseType = TYP_ULONG; - } - else if (simdBaseType == TYP_FLOAT) - { - simdBaseType = TYP_UINT; - } - - GenTreeVecCon* vecCon = gtNewVconNode(retType); - - int64_t lanesToFill = 0; - switch (pattern) - { - case 0: // POW2 - The largest power of 2 - case 31: // ALL - All lanes - lanesToFill = lanes; - break; - - case 1: // VL1 - exactly 1 lane, etc - case 2: // VL2 - case 3: // VL3 - case 4: // VL4 - case 5: // VL5 - case 6: // VL6 - case 7: // VL7 - case 8: // VL8 - lanesToFill = pattern; - break; - - case 9: // VL16 - exactly 16 lanes, etc - case 10: // VL32 - case 11: // VL64 - case 12: // VL128 - case 13: // VL256 - lanesToFill = ((pattern - 8) * 16); - break; - - case 29: // MUL4 - The largest multiple of 4 - lanesToFill = (lanes - (lanes % 4)); - break; - - case 30: // MUL3 - The largest multiple of 3 - lanesToFill = (lanes - (lanes % 3)); - break; - - default: - assert(false); - break; - } - - lanesToFill = std::min(lanesToFill, lanes); - - for (int index = 0; index < lanesToFill; index++) - { - vecCon->SetElementIntegral(simdBaseType, index, laneVal); - } - - return vecCon; -} - #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 06995c7615d84c..d7aa4d21bcebd2 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -440,8 +440,7 @@ enum insSvePattern : unsigned SVE_PATTERN_VL256 = 13, // 256 elements. SVE_PATTERN_MUL4 = 29, // The largest multiple of 4. SVE_PATTERN_MUL3 = 30, // The largest multiple of 3. - SVE_PATTERN_ALL = 31, // All available (implicitly a multiple of two). - SVE_PATTERN_INVALID = 14 + SVE_PATTERN_ALL = 31 // All available (implicitly a multiple of two). }; // Prefetch operation specifier for SVE instructions such as prfb. 
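Note: with gtNewSimdCnsVecTrueMaskPattern and SVE_PATTERN_INVALID removed, the
PTRUE pattern decoding now lives only in the EvaluateSimdPatternTo* helpers in
simd.h. For reference, a standalone sketch of the pattern-to-active-lane-count
mapping those helpers implement; the function name is illustrative, the
clamping of VL1..VL8 mirrors the EvaluateSimdPatternToVector variant, and only
the SveMaskPattern values come from the source:

    #include <algorithm>
    #include <cstdint>

    // Number of leading active lanes selected by an SVE PTRUE pattern,
    // or 0 when the pattern is not representable for this lane count.
    static uint32_t ActiveLaneCount(uint32_t pattern, uint32_t count)
    {
        if (pattern == 0 || pattern == 31)   // POW2, ALL: every lane
            return count;
        if (pattern >= 1 && pattern <= 8)    // VL1..VL8: exactly N lanes
            return std::min(pattern, count);
        if (pattern >= 9 && pattern <= 13)   // VL16..VL256: 16 << (pattern - 9)
            return std::min(16u << (pattern - 9), count);
        if (pattern == 29)                   // MUL4: largest multiple of 4
            return count - (count % 4);
        if (pattern == 30)                   // MUL3: largest multiple of 3
            return count - (count % 3);
        return 0;                            // SveMaskPatternNone / invalid
    }

For a 16-byte vector of TYP_INT (count == 4), VL1 gives 1 lane, VL16 clamps to
4, and MUL3 gives 3; patch 39 below then writes that many all-ones lanes into
the constant mask.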
From bbacdbf69b49ce176a08e2fe003c5abf0d26f9e9 Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 5 Jun 2025 11:00:30 +0100 Subject: [PATCH 39/62] Switch gtNewSimdAllTrueMaskNode to create constant mask --- src/coreclr/jit/hwintrinsicarm64.cpp | 49 ++++++++++---- src/coreclr/jit/simd.h | 95 ++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 6dba038c88bd62..c7ff81fe3909c8 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -3393,7 +3393,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } //------------------------------------------------------------------------ -// gtNewSimdAllTrueMaskNode: Create an embedded mask with all bits set to true +// gtNewSimdAllTrueMaskNode: Create a mask with all bits set to true // // Arguments: // simdBaseJitType -- the base jit type of the nodes being masked @@ -3404,20 +3404,45 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize) { - // Import as a constant vector all bits set + // Import as a constant mask - var_types simdType = getSIMDTypeForSize(simdSize); - var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + var_types simdType = getSIMDTypeForSize(simdSize); + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + bool found = false; + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + + switch (simdType) + { + case TYP_SIMD8: + { + found = EvaluateSimdPatternToMask(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll); + break; + } + + case TYP_SIMD12: + { + found = EvaluateSimdPatternToMask(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll); + break; + } - simd_t simdVal; - bool found = EvaluateSimdPatternToVector(simdBaseType, &simdVal, SveMaskPatternAll); + case TYP_SIMD16: + { + found = EvaluateSimdPatternToMask(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll); + break; + } + + default: + { + unreached(); + } + } assert(found); - return gtNewVconNode(simdType, &simdVal); + return mskCon; } //------------------------------------------------------------------------ -// gtNewSimdFalseMaskByteNode: Create an embedded mask with all bits set to false +// gtNewSimdFalseMaskByteNode: Create a mask with all bits set to false // // Arguments: // simdSize -- the simd size of the nodes being masked @@ -3427,10 +3452,10 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne // GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize) { - // Import as a constant vector 0 - GenTreeVecCon* vecCon = gtNewVconNode(getSIMDTypeForSize(simdSize)); - vecCon->gtSimdVal = simd_t::Zero(); - return vecCon; + // Import as a constant mask 0 + GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); + mskCon->gtSimdMaskVal = simdmask_t::Zero(); + return mskCon; } #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 4a680c0ae4a707..9841bdeb38c93c 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -1698,6 +1698,101 @@ enum SveMaskPattern SveMaskPatternNone = 14 // Invalid }; +template +bool EvaluateSimdPatternToMask(simdmask_t* result, SveMaskPattern pattern) +{ + uint32_t count = sizeof(TSimd) / sizeof(TBase); + uint32_t finalOne = count + 1; + uint64_t mask = 0; + + switch (pattern) + { + case SveMaskPatternLargestPowerOf2: + case 
SveMaskPatternAll:
+            finalOne = count;
+            break;
+
+        case SveMaskPatternVectorCount1:
+        case SveMaskPatternVectorCount2:
+        case SveMaskPatternVectorCount3:
+        case SveMaskPatternVectorCount4:
+        case SveMaskPatternVectorCount5:
+        case SveMaskPatternVectorCount6:
+        case SveMaskPatternVectorCount7:
+        case SveMaskPatternVectorCount8:
+            finalOne = pattern - SveMaskPatternVectorCount1 + 1;
+            break;
+
+        case SveMaskPatternVectorCount16:
+        case SveMaskPatternVectorCount32:
+        case SveMaskPatternVectorCount64:
+        case SveMaskPatternVectorCount128:
+        case SveMaskPatternVectorCount256:
+            finalOne = std::min(uint32_t(16 << (pattern - SveMaskPatternVectorCount16)), count);
+            break;
+
+        case SveMaskPatternLargestMultipleOf4:
+            finalOne = (count - (count % 4));
+            break;
+
+        case SveMaskPatternLargestMultipleOf3:
+            finalOne = (count - (count % 3));
+            break;
+
+        default:
+            return false;
+    }
+    assert(finalOne <= count);
+    assert(finalOne > 0);
+
+    // Write finalOne number of bits
+    for (uint32_t i = 0; i < finalOne; i++)
+    {
+        mask |= static_cast<uint64_t>(1) << (i * sizeof(TBase));
+    }
+
+    memcpy(&result->u8[0], &mask, sizeof(uint64_t));
+    return true;
+}
+
+template <typename TSimd>
+bool EvaluateSimdPatternToMask(var_types baseType, simdmask_t* result, SveMaskPattern pattern)
+{
+    switch (baseType)
+    {
+        case TYP_FLOAT:
+        case TYP_INT:
+        case TYP_UINT:
+        {
+            return EvaluateSimdPatternToMask<TSimd, uint32_t>(result, pattern);
+        }
+
+        case TYP_DOUBLE:
+        case TYP_LONG:
+        case TYP_ULONG:
+        {
+            return EvaluateSimdPatternToMask<TSimd, uint64_t>(result, pattern);
+        }
+
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            return EvaluateSimdPatternToMask<TSimd, uint8_t>(result, pattern);
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            return EvaluateSimdPatternToMask<TSimd, uint16_t>(result, pattern);
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}
+
 template <typename TSimd, typename TBase>
 bool EvaluateSimdPatternToVector(simd_t* result, SveMaskPattern pattern)
 {

From 06e693a69f1f9f4330384a19918a799e0d6ddd0c Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Thu, 5 Jun 2025 12:30:20 +0100
Subject: [PATCH 40/62] fix tests

---
 src/tests/JIT/opt/SVE/ConstantMasks.cs       | 57 ++++++-------------
 .../JIT/opt/SVE/ConstantMasksOp2Fixed.cs     | 57 +++----------------
 2 files changed, 24 insertions(+), 90 deletions(-)

diff --git a/src/tests/JIT/opt/SVE/ConstantMasks.cs b/src/tests/JIT/opt/SVE/ConstantMasks.cs
index 4d2e866fde2b7a..078e60e9b55411 100644
--- a/src/tests/JIT/opt/SVE/ConstantMasks.cs
+++ b/src/tests/JIT/opt/SVE/ConstantMasks.cs
@@ -42,8 +42,8 @@ public static void TestEntryPoint()
         CndSelectOptionalEmbeddedAllBits(op1, op2);

         CndSelectEmbeddedOneOp(op1, op2);
-        CndSelectEmbeddedOneOpFalseMask(op1);
-        CndSelectEmbeddedOneOpZero(op1);
+        CndSelectEmbeddedOneOpFalseMask(op1, op2);
+        CndSelectEmbeddedOneOpZero(op1, op2);
         CndSelectEmbeddedOneOpTrueMask(op1);
         CndSelectEmbeddedOneOpAllBits(op1);

@@ -67,21 +67,15 @@ static void CndSelectEmbedded(Vector<int> mask, Vector<int> op1, Vector<int> op2

     [MethodImpl(MethodImplOptions.NoInlining)]
     static void
CndSelectEmbeddedZero(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movz {{.*}} - Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AbsoluteDifference(op1, op2), op1); + //ARM64-FULL-LINE: mov v0.16b, v1.16b + Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AbsoluteDifference(op1, op2), op2); Consume(result); } @@ -118,21 +112,15 @@ static void CndSelectOptionalEmbedded(Vector mask, Vector op1, Vector< [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedFalseMask(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movz {{.*}} - Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), op1); + //ARM64-FULL-LINE: mov v0.16b, v1.16b + Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), op2); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedZero(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movz {{.*}} - Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.Add(op1, op2), op1); + //ARM64-FULL-LINE: mov v0.16b, v1.16b + Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.Add(op1, op2), op2); Consume(result); } @@ -164,21 +152,15 @@ static void CndSelectEmbeddedOneOp(Vector mask, Vector op1) { } [MethodImpl(MethodImplOptions.NoInlining)] - static void CndSelectEmbeddedOneOpFalseMask(Vector op1) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movz {{.*}} + static void CndSelectEmbeddedOneOpFalseMask(Vector dummy, Vector op1) { + //ARM64-FULL-LINE: mov v0.16b, v1.16b Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Abs(op1), op1); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] - static void CndSelectEmbeddedOneOpZero(Vector op1) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: abs {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movz {{.*}} + static void CndSelectEmbeddedOneOpZero(Vector dummy, Vector op1) { + //ARM64-FULL-LINE: mov v0.16b, v1.16b Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.Abs(op1), op1); Consume(result); } @@ -217,21 +199,14 @@ static void CndSelectEmbeddedReduction(Vector mask, Vector op1, Vecto [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionFalseMask(Vector op1, Vector opf) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE: mov v0.16b, v1.16b 
Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), opf); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionZero(Vector op1, Vector opf) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE: mov v0.16b, v1.16b Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AddAcross(op1), opf); Consume(result); } diff --git a/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs index ecb54cdf4358d2..ba23ebe08f07c9 100644 --- a/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs +++ b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.cs @@ -82,38 +82,28 @@ static void CndSelectEmbeddedZ(Vector mask, Vector op1, Vector op [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedFalseMaskF(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM6-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM6-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.AbsoluteDifference(op1, op2), Sve.CreateFalseMaskInt32()); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedFalseMaskZ(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM6-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM6-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.AbsoluteDifference(op1, op2), Vector.Zero); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedZeroF(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 - //ARM6-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 - //ARM6-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s var result = Sve.ConditionalSelect(Vector.Zero, Sve.AbsoluteDifference(op1, op2), Sve.CreateFalseMaskInt32()); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedZeroZ(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 - //ARM6-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 - //ARM6-FULL-LINE-NEXT: sabd {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s var result = Sve.ConditionalSelect(Vector.Zero, Sve.AbsoluteDifference(op1, op2), Vector.Zero); Consume(result); } @@ -155,10 +145,6 @@ static void CndSelectEmbeddedAllBitsZ(Vector op1, Vector op2) { Consume(result); } - // SVE one op operation (with embedded mask) inside a conditional select - -///...... 
- // SVE operation (with optional embedded mask) inside a conditional select @@ -178,38 +164,28 @@ static void CndSelectOptionalEmbeddedZ(Vector mask, Vector op1, Vector [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedFalseMaskF(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM6-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM6-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), Sve.CreateFalseMaskInt32()); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedFalseMaskZ(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM6-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM6-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s + //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 var result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt32(), Sve.Add(op1, op2), Vector.Zero); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedZeroF(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 - //ARM6-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 - //ARM6-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s var result = Sve.ConditionalSelect(Vector.Zero, Sve.Add(op1, op2), Sve.CreateFalseMaskInt32()); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectOptionalEmbeddedZeroZ(Vector op1, Vector op2) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 //ARM6-FULL-LINE: movi {{v[0-9]+}}.4s, #0 - //ARM6-FULL-LINE-NEXT: cmpne {{p[0-9]+}}.s, {{p[0-9]+}}/z, {{z[0-9]+}}.s, #0 - //ARM6-FULL-LINE-NEXT: add {{z[0-9]+}}.s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s var result = Sve.ConditionalSelect(Vector.Zero, Sve.Add(op1, op2), Vector.Zero); Consume(result); } @@ -269,45 +245,28 @@ static void CndSelectEmbeddedReductionZ(Vector mask, Vector op1) { [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionFalseMaskF(Vector op1) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE: movi v0.4s, #0 Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionFalseMaskZ(Vector op1) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: pfalse {{p[0-9]+}}.b - //ARM64-FULL-LINE-NEXT: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE: movi v0.4s, #0 Vector result = Sve.ConditionalSelect(Sve.CreateFalseMaskInt64(), Sve.AddAcross(op1), Vector.Zero); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void 
CndSelectEmbeddedReductionZeroF(Vector op1) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE: movi v0.4s, #0 Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AddAcross(op1), Sve.CreateFalseMaskInt64()); Consume(result); } [MethodImpl(MethodImplOptions.NoInlining)] static void CndSelectEmbeddedReductionZeroZ(Vector op1) { - //ARMSVE-TODO: This could be optimised to remove both instructions #114433 - //ARM64-FULL-LINE: ptrue {{p[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: saddv {{d[0-9]+}}, {{p[0-9]+}}, {{z[0-9]+}}.s - //ARM64-FULL-LINE-NEXT: movi {{v[0-9]+}}.4s, #0 - //ARM64-FULL-LINE-NEXT: sel {{z[0-9]+}}.d, {{p[0-9]+}}, {{z[0-9]+}}.d, {{z[0-9]+}}.d + //ARM64-FULL-LINE: movi v0.4s, #0 Vector result = Sve.ConditionalSelect(Vector.Zero, Sve.AddAcross(op1), Vector.Zero); Consume(result); } From eb8ca77391ae1658a2704907b5148936d7e2447f Mon Sep 17 00:00:00 2001 From: Alan Hayward Date: Thu, 5 Jun 2025 13:24:18 +0100 Subject: [PATCH 41/62] FEATURE_HW_INTRINSICS checks --- src/coreclr/jit/gentree.cpp | 7 ++++--- src/coreclr/jit/gentree.h | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3a22132e79d101..7739f3faa68cb7 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -32245,11 +32245,11 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) { switch (ni) { -#if defined(TARGET_AMD64) +#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) case NI_EVEX_ConvertVectorToMask: resultNode = gtFoldExprConvertVecCnsToMask(tree, vecCon); break; -#endif // TARGET_AMD64 +#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS #ifdef TARGET_ARM64 case NI_ArmBase_LeadingZeroCount: @@ -33299,7 +33299,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) } return resultNode; } -#endif // FEATURE_HW_INTRINSICS GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree, GenTreeVecCon* vecCon) { @@ -33417,6 +33416,8 @@ bool GenTree::IsFalseMask() const return false; } +#endif // FEATURE_HW_INTRINSICS + //------------------------------------------------------------------------ // gtCanSkipCovariantStoreCheck: see if storing a ref type value to an array // can skip the array store covariance check. 
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index f89ceac47b2ef1..b28b7416f2c455 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1818,8 +1818,11 @@ struct GenTree
     inline bool IsVectorCreate() const;
     inline bool IsVectorAllBitsSet() const;
     inline bool IsVectorBroadcast(var_types simdBaseType) const;
+
+#ifdef FEATURE_HW_INTRINSICS
     bool IsTrueMask(GenTreeHWIntrinsic* parent) const;
     bool IsFalseMask() const;
+#endif

     inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType);

From dd85b76aa41e59613fd2f62f5acc2aa60d7efe56 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Thu, 5 Jun 2025 13:44:06 +0100
Subject: [PATCH 42/62] formatting

---
 src/coreclr/jit/gentree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index b28b7416f2c455..29e8be65dfce5e 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1820,8 +1820,8 @@ struct GenTree
     inline bool IsVectorBroadcast(var_types simdBaseType) const;

 #ifdef FEATURE_HW_INTRINSICS
-    bool IsTrueMask(GenTreeHWIntrinsic* parent) const;
-    bool IsFalseMask() const;
+    bool IsTrueMask(GenTreeHWIntrinsic* parent) const;
+    bool IsFalseMask() const;
 #endif

     inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType);

From e257bf1d9470db080ad76163311fed4aeeb8cfc1 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Thu, 5 Jun 2025 13:46:31 +0100
Subject: [PATCH 43/62] fix gtFoldExprConvertVecCnsToMask call

---
 src/coreclr/jit/gentree.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 7739f3faa68cb7..f6351139c3b32b 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32247,7 +32247,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
         {
 #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
             case NI_EVEX_ConvertVectorToMask:
-                resultNode = gtFoldExprConvertVecCnsToMask(tree, vecCon);
+                resultNode = gtFoldExprConvertVecCnsToMask(tree, cnsNode->AsVecCon());
                 break;
 #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS

From d6249bc13d70d904c3b1bb5f073c84bf9cbd6a9d Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Thu, 5 Jun 2025 14:10:55 +0100
Subject: [PATCH 44/62] move gtFoldExprConvertVecCnsToMask call

---
 src/coreclr/jit/gentree.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index f6351139c3b32b..d41546b3c0485d 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32240,17 +32240,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
             resultNode = gtNewVconNode(retType, &simdVal);
         }
+#if defined(TARGET_XARCH)
+        else if (tree->OperIsConvertVectorToMask())
+        {
+            resultNode = gtFoldExprConvertVecCnsToMask(tree, cnsNode->AsVecCon());
+        }
+#endif // TARGET_XARCH
 #endif // FEATURE_MASKED_HW_INTRINSICS
         else
         {
             switch (ni)
             {
-#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
-                case NI_EVEX_ConvertVectorToMask:
-                    resultNode = gtFoldExprConvertVecCnsToMask(tree, cnsNode->AsVecCon());
-                    break;
-#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS
-
 #ifdef TARGET_ARM64
                 case NI_ArmBase_LeadingZeroCount:
 #else

From 1111249aa32bfc0f8239e4043aa2010bbf561858 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Thu, 5 Jun 2025 15:00:02 +0100
Subject: [PATCH 45/62] Allow for masks being input to mask nodes

---
 src/coreclr/jit/gentree.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index d41546b3c0485d..ca961121157efc 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -33384,6 +33384,10 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const
                 return SveMaskPatternAll ==
                        EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);

+            case TYP_MASK:
+                // The mask parent does not have a vector type, so we cannot evaluate the size of the vector.
+                return false;
+
             default:
                 unreached();
         }

From 712bf3ed229a110062025f398837c4713e927024 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 6 Jun 2025 16:23:29 +0100
Subject: [PATCH 46/62] use IsFalseMask everywhere

---
 src/coreclr/jit/compiler.h                  |  2 +-
 src/coreclr/jit/hwintrinsicarm64.cpp        |  5 +----
 src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 12 ++++++------
 src/coreclr/jit/lowerarmarch.cpp            | 14 +++++++-------
 src/coreclr/jit/morph.cpp                   |  2 +-
 5 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index cd4dc044e47a7d..bf1fc95c876191 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3133,7 +3133,7 @@ class Compiler
 #if defined(TARGET_ARM64)
     GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize);
-    GenTree* gtNewSimdFalseMaskByteNode(unsigned simdSize);
+    GenTree* gtNewSimdFalseMaskByteNode();
 #endif

     GenTree* gtNewSimdBinOpNode(genTreeOps op,

diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index c7ff81fe3909c8..e3bb88a044eb83 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -3444,13 +3444,10 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne
 //------------------------------------------------------------------------
 // gtNewSimdFalseMaskByteNode: Create a mask with all bits set to false
 //
-// Arguments:
-//    simdSize -- the simd size of the nodes being masked
-//
 // Return Value:
 //    The mask
 //
-GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize)
+GenTree* Compiler::gtNewSimdFalseMaskByteNode()
 {
     // Import as a constant mask 0
     GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK);

diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
index e3fb77459446b9..10c148407e8247 100644
--- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -498,7 +498,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
             // Shared code for setting up embedded mask arg for intrinsics with 3+ operands
             auto emitEmbeddedMaskSetupInstrs = [&] {
-                if (intrin.op3->IsVectorZero() || (targetReg != falseReg) || (targetReg != embMaskOp1Reg))
+                if (intrin.op3->IsFalseMask() || (targetReg != falseReg) || (targetReg != embMaskOp1Reg))
                 {
                     return 1;
                 }
@@ -506,7 +506,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
             };

             auto emitEmbeddedMaskSetup = [&] {
-                if (intrin.op3->IsVectorZero())
+                if (intrin.op3->IsFalseMask())
                 {
                     // If `falseReg` is zero, then move the first operand of `intrinEmbMask` in the
                     // destination using /Z.
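A note for readers following the series: the shape being special-cased in these hunks is an SVE ConditionalSelect whose false operand is zero. A minimal C# sketch of that shape, assuming the experimental System.Runtime.Intrinsics.Arm.Sve API used by the tests later in this series (MaskedAddZeroed is an illustrative name, not code from this change):

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static Vector<int> MaskedAddZeroed(Vector<int> mask, Vector<int> a, Vector<int> b)
{
    // op1 = mask, op2 = embedded Add, op3 = zero. When op3 is a zero vector or a
    // false mask, the JIT can zero the inactive lanes directly with /Z (zeroing)
    // predication instead of materialising a zero register to merge with.
    return Sve.ConditionalSelect(mask, Sve.Add(a, b), Vector<int>.Zero);
}

The sketch assumes Sve.IsSupported; internally the JIT wraps predicated SVE operations in exactly this kind of ConditionalSelect, which is why the zero/false-mask forms of op3 keep appearing below.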
@@ -608,7 +608,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 if (intrin.op3->isContained())
                 {
-                    assert(intrin.op3->IsVectorZero());
+                    assert(intrin.op3->IsFalseMask());

                     if (intrin.op1->isContained() || intrin.op1->IsTrueMask(node))
                     {
@@ -750,7 +750,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                         // Predicate functionality is currently not exposed for this API,
                         // but the FADDA instruction only has a predicated variant.
                         // Thus, we expect the JIT to wrap this with CndSel.
-                        assert(intrin.op3->IsVectorZero());
+                        assert(intrin.op3->IsFalseMask());
                         break;

                     default:
@@ -791,9 +791,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 }
             };

-            if (intrin.op3->IsVectorZero())
+            if (intrin.op3->IsFalseMask())
             {
-                // If `falseReg` is zero, then move the first operand of `intrinEmbMask` in the
+                // If `falseReg` is a false mask, then move the first operand of `intrinEmbMask` in the
                 // destination using /Z.

diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 0598936681dc4d..c2e664f9306f13 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -1974,7 +1974,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
     bool foundUse = BlockRange().TryGetUse(node, &use);

     GenTree* trueMask = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize);
-    GenTree* falseVal = comp->gtNewZeroConNode(simdType);
+    GenTree* falseVal = comp->gtNewSimdFalseMaskByteNode();

     var_types nodeType = simdType;

     if (HWIntrinsicInfo::ReturnsPerElementMask(node->GetHWIntrinsicId()))
@@ -3695,7 +3695,7 @@ bool Lowering::TryContainingCselOp(GenTreeHWIntrinsic* parentNode, GenTreeHWIntr
     bool      canContain   = false;
     var_types simdBaseType = parentNode->GetSimdBaseType();
-    if (childNode->Op(3)->IsVectorZero())
+    if (childNode->Op(3)->IsFalseMask())
     {
         switch (parentNode->GetHWIntrinsicId())
         {
@@ -3952,12 +3952,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             GenTree* op3 = intrin.op3;

             // Handle op1
-            if (op1->IsVectorZero())
+            if (op1->IsFalseMask())
             {
                 // When we are merging with zero, we can specialize
                 // and avoid instantiating the vector constant.
                 MakeSrcContained(node, op1);
-                JITDUMP("Containing vector zero op1 inside ConditionalSelect\n");
+                JITDUMP("Containing false mask op1 inside ConditionalSelect\n");
                 DISPTREERANGE(BlockRange(), op1);
             }

             // Handle op2
@@ -4023,13 +4023,13 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             }

             // Handle op3
-            if (op3->IsVectorZero() && op1->IsTrueMask(node) && op2->IsEmbMaskOp())
+            if (op3->IsFalseMask() && op1->IsTrueMask(node) && op2->IsEmbMaskOp())
             {
                 // When we are merging with zero, we can specialize
                 // and avoid instantiating the vector constant.
                 // Do this only if op1 was AllTrueMask
                 MakeSrcContained(node, op3);
-                JITDUMP("Containing vector zero op3 inside ConditionalSelect\n");
+                JITDUMP("Containing false mask op3 inside ConditionalSelect\n");
                 DISPTREERANGE(BlockRange(), op3);
             }

@@ -4149,7 +4149,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode)
     // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this
     // optimisation when the nestedOp is a reduce operation.
     if (nestedOp1->IsTrueMask(cndSelNode) && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) &&
-        (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero()))
+        (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsFalseMask()))
     {
         GenTree* nestedOp2 = nestedCndSel->Op(2);
         GenTree* nestedOp3 = nestedCndSel->Op(3);

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 90c1a5ae7f4e03..5b27a508ad2dee 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9772,7 +9772,7 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic*
     else if (node->IsVectorZero())
     {
         // Morph the vector of zeroes into mask of zeroes.
-        GenTree* mask = gtNewSimdFalseMaskByteNode(parent->GetSimdSize());
+        GenTree* mask = gtNewSimdFalseMaskByteNode();
         mask->SetMorphed(this);
         return mask;
     }

From a2077fdaa37549782959eb7ba14f0ee8d885991e Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Mon, 9 Jun 2025 12:13:41 +0100
Subject: [PATCH 47/62] Add simdSize to GenTreeMskCon

---
 src/coreclr/jit/assertionprop.cpp    | 5 ++++-
 src/coreclr/jit/compiler.h           | 4 ++--
 src/coreclr/jit/gentree.cpp          | 9 +++++----
 src/coreclr/jit/gentree.h            | 3 ++-
 src/coreclr/jit/hwintrinsicarm64.cpp | 6 +++---
 src/coreclr/jit/lowerarmarch.cpp     | 2 +-
 src/coreclr/jit/morph.cpp            | 2 +-
 7 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp
index ed83b7d6384b33..a87bc4ee4578a2 100644
--- a/src/coreclr/jit/assertionprop.cpp
+++ b/src/coreclr/jit/assertionprop.cpp
@@ -3028,7 +3028,10 @@ GenTree* Compiler::optVNBasedFoldConstExpr(BasicBlock* block, GenTree* parent, G
         {
             simdmask_t value = vnStore->ConstantValue<simdmask_t>(vnCns);

-            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet());
+            assert(parent->OperIsHWIntrinsic());
+            unsigned simdSize = parent->AsHWIntrinsic()->GetSimdSize();
+
+            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet(), simdSize);

             memcpy(&mskCon->gtSimdMaskVal, &value, sizeof(simdmask_t));
             conValTree = mskCon;

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index bf1fc95c876191..840f04ab618588 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3026,7 +3026,7 @@ class Compiler
 #endif // FEATURE_SIMD

 #if defined(FEATURE_MASKED_HW_INTRINSICS)
-    GenTreeMskCon* gtNewMskConNode(var_types type);
+    GenTreeMskCon* gtNewMskConNode(var_types type, unsigned char simdSize);
 #endif // FEATURE_MASKED_HW_INTRINSICS

     GenTree* gtNewAllBitsSetConNode(var_types type);
@@ -3133,7 +3133,7 @@ class Compiler
 #if defined(TARGET_ARM64)
     GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize);
-    GenTree* gtNewSimdFalseMaskByteNode();
+    GenTree* gtNewSimdFalseMaskByteNode(unsigned simdSize);
 #endif

     GenTree* gtNewSimdBinOpNode(genTreeOps op,

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index ca961121157efc..0f33efbef2e63f 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -7876,9 +7876,10 @@ GenTreeVecCon* Compiler::gtNewVconNode(var_types type, void* data)
 #endif // FEATURE_SIMD

 #if defined(FEATURE_MASKED_HW_INTRINSICS)
-GenTreeMskCon* Compiler::gtNewMskConNode(var_types type)
+GenTreeMskCon* Compiler::gtNewMskConNode(var_types type, unsigned char simdSize)
 {
     GenTreeMskCon* mskCon = new (this, GT_CNS_MSK) GenTreeMskCon(type);
+    mskCon->gtSimdSize    = simdSize;
     return mskCon;
 }
 #endif // FEATURE_MASKED_HW_INTRINSICS
@@ -9212,7 +9213,7 @@ GenTree* Compiler::gtClone(GenTree* tree, bool complexOK)
 #if defined(FEATURE_MASKED_HW_INTRINSICS)
         case GT_CNS_MSK:
         {
-            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet());
+            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet(), tree->AsMskCon()->gtSimdSize);
             mskCon->gtSimdMaskVal = tree->AsMskCon()->gtSimdMaskVal;
             copy                  = mskCon;
             break;
@@ -9403,7 +9404,7 @@ GenTree* Compiler::gtCloneExpr(GenTree* tree)
 #if defined(FEATURE_MASKED_HW_INTRINSICS)
         case GT_CNS_MSK:
         {
-            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet());
+            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet(), tree->AsMskCon()->gtSimdSize);
             mskCon->gtSimdMaskVal = tree->AsMskCon()->gtSimdMaskVal;
             copy                  = mskCon;
             goto DONE;
@@ -33307,7 +33308,7 @@ GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree,
     var_types retType      = tree->TypeGet();
     var_types simdBaseType = tree->GetSimdBaseType();

-    GenTreeMskCon* mskCon = gtNewMskConNode(retType);
+    GenTreeMskCon* mskCon = gtNewMskConNode(retType, tree->GetSimdSize());

     switch (vecCon->TypeGet())
     {

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 29e8be65dfce5e..1b8970efcd93bd 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -7324,7 +7324,8 @@ struct GenTreeVecCon : public GenTree
 //
 struct GenTreeMskCon : public GenTree
 {
-    simdmask_t gtSimdMaskVal;
+    simdmask_t    gtSimdMaskVal;
+    unsigned char gtSimdSize; // SIMD vector size in bytes

     void EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types baseType, unsigned simdSize);
     void EvaluateBinaryInPlace(

diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index e3bb88a044eb83..c4dd8f4d17975b 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -3409,7 +3409,7 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne
     var_types      simdType     = getSIMDTypeForSize(simdSize);
     var_types      simdBaseType = JitType2PreciseVarType(simdBaseJitType);
     bool           found        = false;
-    GenTreeMskCon* mskCon       = gtNewMskConNode(TYP_MASK);
+    GenTreeMskCon* mskCon       = gtNewMskConNode(TYP_MASK, simdSize);

     switch (simdType)
     {
@@ -3447,10 +3447,10 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne
 // Return Value:
 //    The mask
 //
-GenTree* Compiler::gtNewSimdFalseMaskByteNode()
+GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize)
 {
     // Import as a constant mask 0
-    GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK);
+    GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK, simdSize);
     mskCon->gtSimdMaskVal = simdmask_t::Zero();
     return mskCon;
 }

diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index c2e664f9306f13..cb728ff0b7941c 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -1974,7 +1974,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
     bool foundUse = BlockRange().TryGetUse(node, &use);

     GenTree* trueMask = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize);
-    GenTree* falseVal = comp->gtNewSimdFalseMaskByteNode();
+    GenTree* falseVal = comp->gtNewSimdFalseMaskByteNode(simdSize);

     var_types nodeType = simdType;

     if (HWIntrinsicInfo::ReturnsPerElementMask(node->GetHWIntrinsicId()))

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 5b27a508ad2dee..90c1a5ae7f4e03 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9772,7 +9772,7 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic*
     else if (node->IsVectorZero())
     {
         // Morph the vector of zeroes into mask of zeroes.
-        GenTree* mask = gtNewSimdFalseMaskByteNode();
+        GenTree* mask = gtNewSimdFalseMaskByteNode(parent->GetSimdSize());
         mask->SetMorphed(this);
         return mask;
     }

From b9acb601f22a6dfccdf8bf9839b8d06b69d270c5 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Mon, 9 Jun 2025 16:12:21 +0100
Subject: [PATCH 48/62] Use simdSize in GenTreeMskCon

---
 src/coreclr/jit/gentree.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 0f33efbef2e63f..c90f324babba23 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -33371,24 +33371,20 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const
     if (IsCnsMsk())
     {
-        switch (parent->gtType)
+        switch (AsMskCon()->gtSimdSize)
         {
-            case TYP_SIMD8:
+            case 8:
                 return SveMaskPatternAll ==
                        EvaluateSimdMaskToPattern<simd8_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);

-            case TYP_SIMD12:
+            case 12:
                 return SveMaskPatternAll ==
                        EvaluateSimdMaskToPattern<simd12_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);

-            case TYP_SIMD16:
+            case 16:
                 return SveMaskPatternAll ==
                        EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);

-            case TYP_MASK:
-                // The mask parent does not have a vector type, so we cannot evaluate the size of the vector.
-                return false;
-
             default:
                 unreached();
         }

From a922091c3dc9ca9a19ca66c2c502cf82b12fd6e0 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Mon, 9 Jun 2025 16:59:59 +0100
Subject: [PATCH 49/62] cndsel op3 is a vector

---
 src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 10 +++++-----
 src/coreclr/jit/lowerarmarch.cpp            | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
index 10c148407e8247..29d5df70ad2c7d 100644
--- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -498,7 +498,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
             // Shared code for setting up embedded mask arg for intrinsics with 3+ operands
             auto emitEmbeddedMaskSetupInstrs = [&] {
-                if (intrin.op3->IsFalseMask() || (targetReg != falseReg) || (targetReg != embMaskOp1Reg))
+                if (intrin.op3->IsVectorZero() || (targetReg != falseReg) || (targetReg != embMaskOp1Reg))
                 {
                     return 1;
                 }
@@ -506,7 +506,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
             };

             auto emitEmbeddedMaskSetup = [&] {
-                if (intrin.op3->IsFalseMask())
+                if (intrin.op3->IsVectorZero())
                 {
                     // If `falseReg` is zero, then move the first operand of `intrinEmbMask` in the
                     // destination using /Z.
@@ -750,7 +750,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                         // Predicate functionality is currently not exposed for this API,
                         // but the FADDA instruction only has a predicated variant.
                         // Thus, we expect the JIT to wrap this with CndSel.
-                        assert(intrin.op3->IsFalseMask());
+                        assert(intrin.op3->IsVectorZero());
                         break;

                     default:
@@ -791,9 +791,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 }
             };

-            if (intrin.op3->IsFalseMask())
+            if (intrin.op3->IsVectorZero())
             {
-                // If `falseReg` is a false mask, then move the first operand of `intrinEmbMask` in the
+                // If `falseReg` is zero, then move the first operand of `intrinEmbMask` in the
                 // destination using /Z.
                 switch (intrinEmbMask.id)

diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index cb728ff0b7941c..8f42583a986d34 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -1962,7 +1962,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
         node->Op(lastOpNum)->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect &&
         TryContainingCselOp(node, node->Op(lastOpNum)->AsHWIntrinsic()))
     {
-        JITDUMP("lowering EmbeddedMasked HWIntrinisic (after):\n");
+        JITDUMP("contained CondSel inside HWIntrinsic (after):\n");
         DISPTREERANGE(BlockRange(), node);
         JITDUMP("\n");
         return node->gtNext;
@@ -1974,7 +1974,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
     bool foundUse = BlockRange().TryGetUse(node, &use);

     GenTree* trueMask = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize);
-    GenTree* falseVal = comp->gtNewSimdFalseMaskByteNode(simdSize);
+    GenTree* falseVal = comp->gtNewZeroConNode(simdType);

     var_types nodeType = simdType;

     if (HWIntrinsicInfo::ReturnsPerElementMask(node->GetHWIntrinsicId()))
@@ -3695,7 +3695,7 @@ bool Lowering::TryContainingCselOp(GenTreeHWIntrinsic* parentNode, GenTreeHWIntr
     bool      canContain   = false;
     var_types simdBaseType = parentNode->GetSimdBaseType();
-    if (childNode->Op(3)->IsFalseMask())
+    if (childNode->Op(3)->IsVectorZero())
     {
         switch (parentNode->GetHWIntrinsicId())
         {
@@ -4023,7 +4023,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             }

             // Handle op3
-            if (op3->IsFalseMask() && op1->IsTrueMask(node) && op2->IsEmbMaskOp())
+            if (op3->IsVectorZero() && op1->IsTrueMask(node) && op2->IsEmbMaskOp())
             {
                 // When we are merging with zero, we can specialize
                 // and avoid instantiating the vector constant.
@@ -4149,7 +4149,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode)
     // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this
     // optimisation when the nestedOp is a reduce operation.
     if (nestedOp1->IsTrueMask(cndSelNode) && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) &&
-        (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsFalseMask()))
+        (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero()))
     {
         GenTree* nestedOp2 = nestedCndSel->Op(2);
         GenTree* nestedOp3 = nestedCndSel->Op(3);

From 1acdf014c0887b86fda26477f5300b1c4fbc53ae Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Mon, 9 Jun 2025 17:02:18 +0100
Subject: [PATCH 50/62] use unsigned instead of unsigned char

---
 src/coreclr/jit/compiler.h                  | 2 +-
 src/coreclr/jit/gentree.cpp                 | 4 ++--
 src/coreclr/jit/gentree.h                   | 4 ++--
 src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 840f04ab618588..c4d46b050816b7 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3026,7 +3026,7 @@ class Compiler
 #endif // FEATURE_SIMD

 #if defined(FEATURE_MASKED_HW_INTRINSICS)
-    GenTreeMskCon* gtNewMskConNode(var_types type, unsigned char simdSize);
+    GenTreeMskCon* gtNewMskConNode(var_types type, unsigned simdSize);
 #endif // FEATURE_MASKED_HW_INTRINSICS

     GenTree* gtNewAllBitsSetConNode(var_types type);

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index c90f324babba23..90367a3c8e186b 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -7876,10 +7876,10 @@ GenTreeVecCon* Compiler::gtNewVconNode(var_types type, void* data)
 #endif // FEATURE_SIMD

 #if defined(FEATURE_MASKED_HW_INTRINSICS)
-GenTreeMskCon* Compiler::gtNewMskConNode(var_types type, unsigned char simdSize)
+GenTreeMskCon* Compiler::gtNewMskConNode(var_types type, unsigned simdSize)
 {
     GenTreeMskCon* mskCon = new (this, GT_CNS_MSK) GenTreeMskCon(type);
-    mskCon->gtSimdSize    = simdSize;
+    mskCon->gtSimdSize = simdSize;
     return mskCon;
 }
 #endif // FEATURE_MASKED_HW_INTRINSICS

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 1b8970efcd93bd..65bc5457f83913 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -7324,8 +7324,8 @@ struct GenTreeVecCon : public GenTree
 //
 struct GenTreeMskCon : public GenTree
 {
-    simdmask_t    gtSimdMaskVal;
-    unsigned char gtSimdSize; // SIMD vector size in bytes
+    simdmask_t gtSimdMaskVal;
+    unsigned   gtSimdSize; // SIMD vector size in bytes

     void EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types baseType, unsigned simdSize);
     void EvaluateBinaryInPlace(

diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
index 29d5df70ad2c7d..e3fb77459446b9 100644
--- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -608,7 +608,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 if (intrin.op3->isContained())
                 {
-                    assert(intrin.op3->IsFalseMask());
+                    assert(intrin.op3->IsVectorZero());

                     if (intrin.op1->isContained() || intrin.op1->IsTrueMask(node))
                     {

From 80a0ae7935adf1057558cf5f226a3a5a6147ecbc Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Tue, 10 Jun 2025 09:53:41 +0100
Subject: [PATCH 51/62] Fix HasDisasmCheck

---
 src/tests/JIT/opt/SVE/ConstantMasks.csproj         | 2 +-
 src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.csproj | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tests/JIT/opt/SVE/ConstantMasks.csproj b/src/tests/JIT/opt/SVE/ConstantMasks.csproj
index ed531920304c5e..5482afbaa21aa8 100644
--- a/src/tests/JIT/opt/SVE/ConstantMasks.csproj
+++ b/src/tests/JIT/opt/SVE/ConstantMasks.csproj
@@ -10,7 +10,7 @@
-    true
+    true

diff --git a/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.csproj b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.csproj
index ed531920304c5e..5482afbaa21aa8 100644
--- a/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.csproj
+++ b/src/tests/JIT/opt/SVE/ConstantMasksOp2Fixed.csproj
@@ -10,7 +10,7 @@
-    true
+    true

From 17f8ab2ffc90b849fd0377feb78bd9be6c00b1ad Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Tue, 10 Jun 2025 10:08:44 +0100
Subject: [PATCH 52/62] Hardcode mask simd size to 16

---
 src/coreclr/jit/assertionprop.cpp    |  5 +---
 src/coreclr/jit/compiler.h           |  6 ++---
 src/coreclr/jit/gentree.cpp          | 30 ++++++-----------------
 src/coreclr/jit/gentree.h            |  1 -
 src/coreclr/jit/hwintrinsicarm64.cpp | 37 +++++-----------------------
 src/coreclr/jit/lowerarmarch.cpp     |  2 +-
 src/coreclr/jit/morph.cpp            |  2 +-
 7 files changed, 20 insertions(+), 63 deletions(-)

diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp
index a87bc4ee4578a2..ed83b7d6384b33 100644
--- a/src/coreclr/jit/assertionprop.cpp
+++ b/src/coreclr/jit/assertionprop.cpp
@@ -3028,10 +3028,7 @@ GenTree* Compiler::optVNBasedFoldConstExpr(BasicBlock* block, GenTree* parent, G
         {
             simdmask_t value = vnStore->ConstantValue<simdmask_t>(vnCns);

-            assert(parent->OperIsHWIntrinsic());
-            unsigned simdSize = parent->AsHWIntrinsic()->GetSimdSize();
-
-            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet(), simdSize);
+            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet());

             memcpy(&mskCon->gtSimdMaskVal, &value, sizeof(simdmask_t));
             conValTree = mskCon;

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index c4d46b050816b7..f3ab0bd75a424c 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3026,7 +3026,7 @@ class Compiler
 #endif // FEATURE_SIMD

 #if defined(FEATURE_MASKED_HW_INTRINSICS)
-    GenTreeMskCon* gtNewMskConNode(var_types type, unsigned simdSize);
+    GenTreeMskCon* gtNewMskConNode(var_types type);
 #endif // FEATURE_MASKED_HW_INTRINSICS

     GenTree* gtNewAllBitsSetConNode(var_types type);
@@ -3132,8 +3132,8 @@ class Compiler
         var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);

 #if defined(TARGET_ARM64)
-    GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize);
-    GenTree* gtNewSimdFalseMaskByteNode(unsigned simdSize);
+    GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType);
+    GenTree* gtNewSimdFalseMaskByteNode();
 #endif

     GenTree* gtNewSimdBinOpNode(genTreeOps op,

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 90367a3c8e186b..b9af918a719d1c 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -7876,10 +7876,9 @@ GenTreeVecCon* Compiler::gtNewVconNode(var_types type, void* data)
 #endif // FEATURE_SIMD

 #if defined(FEATURE_MASKED_HW_INTRINSICS)
-GenTreeMskCon* Compiler::gtNewMskConNode(var_types type, unsigned simdSize)
+GenTreeMskCon* Compiler::gtNewMskConNode(var_types type)
 {
     GenTreeMskCon* mskCon = new (this, GT_CNS_MSK) GenTreeMskCon(type);
-    mskCon->gtSimdSize = simdSize;
     return mskCon;
 }
 #endif // FEATURE_MASKED_HW_INTRINSICS
@@ -9213,7 +9212,7 @@ GenTree* Compiler::gtClone(GenTree* tree, bool complexOK)
 #if defined(FEATURE_MASKED_HW_INTRINSICS)
         case GT_CNS_MSK:
         {
-            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet(), tree->AsMskCon()->gtSimdSize);
+            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet());
             mskCon->gtSimdMaskVal = tree->AsMskCon()->gtSimdMaskVal;
             copy                  = mskCon;
             break;
@@ -9404,7 +9403,7 @@ GenTree* Compiler::gtCloneExpr(GenTree* tree)
 #if defined(FEATURE_MASKED_HW_INTRINSICS)
         case GT_CNS_MSK:
         {
-            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet(), tree->AsMskCon()->gtSimdSize);
+            GenTreeMskCon* mskCon = gtNewMskConNode(tree->TypeGet());
             mskCon->gtSimdMaskVal = tree->AsMskCon()->gtSimdMaskVal;
             copy                  = mskCon;
             goto DONE;
@@ -33308,7 +33307,7 @@ GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree,
     var_types retType      = tree->TypeGet();
     var_types simdBaseType = tree->GetSimdBaseType();

-    GenTreeMskCon* mskCon = gtNewMskConNode(retType, tree->GetSimdSize());
+    GenTreeMskCon* mskCon = gtNewMskConNode(retType);

     switch (vecCon->TypeGet())
     {
@@ -33369,25 +33368,12 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const
 #ifdef TARGET_ARM64
     var_types ParentSimdBaseType = JitType2PreciseVarType(parent->GetSimdBaseJitType());

+    // TODO-SVE: For agnostic VL, vector type may not be simd16_t
+
     if (IsCnsMsk())
     {
-        switch (AsMskCon()->gtSimdSize)
-        {
-            case 8:
-                return SveMaskPatternAll ==
-                       EvaluateSimdMaskToPattern<simd8_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
-
-            case 12:
-                return SveMaskPatternAll ==
-                       EvaluateSimdMaskToPattern<simd12_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
-
-            case 16:
-                return SveMaskPatternAll ==
-                       EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
-
-            default:
-                unreached();
-        }
+        return SveMaskPatternAll ==
+               EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
     }
 #endif

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 65bc5457f83913..29e8be65dfce5e 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -7325,7 +7325,6 @@ struct GenTreeVecCon : public GenTree
 struct GenTreeMskCon : public GenTree
 {
     simdmask_t gtSimdMaskVal;
-    unsigned   gtSimdSize; // SIMD vector size in bytes

     void EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types baseType, unsigned simdSize);
     void EvaluateBinaryInPlace(

diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index c4dd8f4d17975b..4ba69cfbbc7fc9 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -3397,45 +3397,20 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 //
 // Arguments:
 //    simdBaseJitType -- the base jit type of the nodes being masked
-//    simdSize        -- the simd size of the nodes being masked
 //
 // Return Value:
 //    The mask
 //
-GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize)
+GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType)
 {
     // Import as a constant mask
-    var_types      simdType     = getSIMDTypeForSize(simdSize);
     var_types      simdBaseType = JitType2PreciseVarType(simdBaseJitType);
-    bool           found        = false;
-    GenTreeMskCon* mskCon       = gtNewMskConNode(TYP_MASK, simdSize);
+    GenTreeMskCon* mskCon       = gtNewMskConNode(TYP_MASK);

-    switch (simdType)
-    {
-        case TYP_SIMD8:
-        {
-            found = EvaluateSimdPatternToMask<simd8_t>(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll);
-            break;
-        }
+    // TODO-SVE: For agnostic VL, vector type may not be simd16_t

-        case TYP_SIMD12:
-        {
-            found = EvaluateSimdPatternToMask<simd12_t>(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll);
-            break;
-        }
-
-        case TYP_SIMD16:
-        {
-            found = EvaluateSimdPatternToMask<simd16_t>(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll);
-            break;
-        }
-
-        default:
-        {
-            unreached();
-        }
-    }
+    bool found = EvaluateSimdPatternToMask<simd16_t>(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll);
     assert(found);

     return mskCon;
@@ -3447,10 +3422,10 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne
 // Return Value:
 //    The mask
 //
-GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize)
+GenTree* Compiler::gtNewSimdFalseMaskByteNode()
 {
     // Import as a constant mask 0
-    GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK, simdSize);
+    GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK);
     mskCon->gtSimdMaskVal = simdmask_t::Zero();
     return mskCon;
 }

diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 8f42583a986d34..efba9f6264fca4 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -1973,7 +1973,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
     var_types simdType = Compiler::getSIMDTypeForSize(simdSize);
     bool foundUse = BlockRange().TryGetUse(node, &use);

-    GenTree* trueMask = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize);
+    GenTree* trueMask = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType);
     GenTree* falseVal = comp->gtNewZeroConNode(simdType);

     var_types nodeType = simdType;

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 90c1a5ae7f4e03..5b27a508ad2dee 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9772,7 +9772,7 @@ GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic*
     else if (node->IsVectorZero())
     {
         // Morph the vector of zeroes into mask of zeroes.
-        GenTree* mask = gtNewSimdFalseMaskByteNode(parent->GetSimdSize());
+        GenTree* mask = gtNewSimdFalseMaskByteNode();
         mask->SetMorphed(this);
         return mask;
     }

From 5aa14cebbe8e15b944aa65dee216d269c9bcd0b6 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Tue, 10 Jun 2025 12:23:03 +0100
Subject: [PATCH 53/62] formatting

---
 src/coreclr/jit/gentree.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index b9af918a719d1c..8df58c2e09df36 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -33372,8 +33372,7 @@ bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const
     if (IsCnsMsk())
     {
-        return SveMaskPatternAll ==
-               EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+        return SveMaskPatternAll == EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
     }
 #endif

From 267bd754ba590b0e607a3e006aeef0d902f527dd Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Wed, 11 Jun 2025 17:14:22 +0100
Subject: [PATCH 54/62] remove TODO

---
 src/coreclr/jit/hwintrinsicarm64.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index 4ba69cfbbc7fc9..0ca473a49e3d48 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -2882,8 +2882,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             assert(sig->numArgs == 1);
             op1 = impPopStack().val;

-            // TODO: For AOT, always do the gtNewSimdHWIntrinsicNode as we don't know the vector size.
-
             // Where possible, import a constant mask to allow for optimisations.
             if (op1->IsIntegralConst())
             {

From c95b68a5dbc48dcef42ab68baafaae3784837565 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Wed, 11 Jun 2025 17:27:24 +0100
Subject: [PATCH 55/62] Use simdBaseType for IsTrueMask arg

---
 src/coreclr/jit/gentree.cpp                 | 17 ++++++++---------
 src/coreclr/jit/gentree.h                   |  2 +-
 src/coreclr/jit/hwintrinsiccodegenarm64.cpp |  4 ++--
 src/coreclr/jit/lowerarmarch.cpp            |  6 +++---
 4 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 8df58c2e09df36..79f062178f7c99 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -33233,7 +33233,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                 break;
             }

-            if (op1->IsVectorAllBitsSet() || op1->IsTrueMask(tree))
+            if (op1->IsVectorAllBitsSet() || op1->IsTrueMask(simdBaseType))
             {
                 if ((op3->gtFlags & GTF_SIDE_EFFECT) != 0)
                 {
@@ -33356,23 +33356,22 @@ GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree,
 // IsTrueMask: Is the given node a true mask
 //
 // Arguments:
-//    parent - parent of the node
+//    simdBaseType - the base type of the mask
 //
-// Returns true if the node is a true mask for the given parent.
+// Returns true if the node is a true mask for the given simdBaseType.
 //
-// Note that a byte true mask is different to an int true mask, therefore
-// the usage of the mask (ie the type of the parent) needs to be taken into account.
+// Note that a byte true mask (1111...) is different to an int true mask
+// (10001000...), therefore the simdBaseType of the mask needs to be
+// taken into account.
 //
-bool GenTree::IsTrueMask(GenTreeHWIntrinsic* parent) const
+bool GenTree::IsTrueMask(var_types simdBaseType) const
 {
 #ifdef TARGET_ARM64
-    var_types ParentSimdBaseType = JitType2PreciseVarType(parent->GetSimdBaseJitType());
-
     // TODO-SVE: For agnostic VL, vector type may not be simd16_t

     if (IsCnsMsk())
     {
-        return SveMaskPatternAll == EvaluateSimdMaskToPattern<simd16_t>(ParentSimdBaseType, AsMskCon()->gtSimdMaskVal);
+        return SveMaskPatternAll == EvaluateSimdMaskToPattern<simd16_t>(simdBaseType, AsMskCon()->gtSimdMaskVal);
     }
 #endif

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 29e8be65dfce5e..7f3f7ad4d900e0 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1820,7 +1820,7 @@ struct GenTree
     inline bool IsVectorBroadcast(var_types simdBaseType) const;

 #ifdef FEATURE_HW_INTRINSICS
-    bool IsTrueMask(GenTreeHWIntrinsic* parent) const;
+    bool IsTrueMask(var_types simdBaseType) const;
     bool IsFalseMask() const;
 #endif

diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
index e3fb77459446b9..932335002b3341 100644
--- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -512,7 +512,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                     // destination using /Z.
                     assert((targetReg != embMaskOp2Reg) || (embMaskOp1Reg == embMaskOp2Reg));
-                    assert(intrin.op3->isContained() || !intrin.op1->IsTrueMask(node));
+                    assert(intrin.op3->isContained() || !intrin.op1->IsTrueMask(node->GetSimdBaseType()));
                     GetEmitter()->emitInsSve_R_R_R(INS_sve_movprfx, emitSize, targetReg, maskReg, embMaskOp1Reg, opt);
                 }
                 else
@@ -610,7 +610,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                 {
                     assert(intrin.op3->IsVectorZero());

-                    if (intrin.op1->isContained() || intrin.op1->IsTrueMask(node))
+                    if (intrin.op1->isContained() || intrin.op1->IsTrueMask(node->GetSimdBaseType()))
                     {
                         // We already skip importing ConditionalSelect if op1 == trueAll, however
                         // if we still see it here, it is because we wrapped the predicated instruction

diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index efba9f6264fca4..e87bc51dbde65a 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -4023,7 +4023,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             }

             // Handle op3
-            if (op3->IsVectorZero() && op1->IsTrueMask(node) && op2->IsEmbMaskOp())
+            if (op3->IsVectorZero() && op1->IsTrueMask(node->GetSimdBaseType()) && op2->IsEmbMaskOp())
             {
                 // When we are merging with zero, we can specialize
@@ -4148,7 +4148,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode)
     // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this
     // optimisation when the nestedOp is a reduce operation.

-    if (nestedOp1->IsTrueMask(cndSelNode) && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) &&
+    if (nestedOp1->IsTrueMask(cndSelNode->GetSimdBaseType()) && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) &&
         (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero()))
     {
         GenTree* nestedOp2 = nestedCndSel->Op(2);
@@ -4177,7 +4177,7 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode)
             }
         }
     }
-    else if (op1->IsTrueMask(cndSelNode))
+    else if (op1->IsTrueMask(cndSelNode->GetSimdBaseType()))
     {
         // Any case where op2 is not an embedded HWIntrinsic
         if (!op2->OperIsHWIntrinsic() ||

From f0508a711c380094a0121da441b58736c31bcf98 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Jun 2025 09:24:01 +0100
Subject: [PATCH 56/62] Add asserts to gtFoldExprHWIntrinsic

---
 src/coreclr/jit/gentree.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 0fbdf8e3817da6..f9b725623840fc 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32792,6 +32792,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                 break;
             }

+#if defined(TARGET_ARM64)
+            if (ni == NI_Sve_ConditionalSelect)
+            {
+                assert(!op1->IsVectorAllBitsSet() && !op1->IsVectorZero());
+            }
+            else
+            {
+                assert(!op1->IsTrueMask(simdBaseType) && !op1->IsFalseMask());
+            }
+#endif
+
             if (op1->IsVectorAllBitsSet() || op1->IsTrueMask(simdBaseType))
             {
                 if ((op3->gtFlags & GTF_SIDE_EFFECT) != 0)
                 {

From 4c91658d3e7a00e0a55b293ac6f83954f4806252 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Jun 2025 09:27:38 +0100
Subject: [PATCH 57/62] Simplify IsFalseMask

---
 src/coreclr/jit/gentree.cpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index f9b725623840fc..c5c132e38cbf5a 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32951,18 +32951,7 @@ bool GenTree::IsTrueMask(var_types simdBaseType) const
 bool GenTree::IsFalseMask() const
 {
 #ifdef TARGET_ARM64
-    if (OperIsHWIntrinsic())
-    {
-        NamedIntrinsic id = AsHWIntrinsic()->GetHWIntrinsicId();
-        if (id == NI_Sve_ConvertMaskToVector)
-        {
-            GenTree* op1 = AsHWIntrinsic()->Op(1);
-            assert(op1->OperIsHWIntrinsic());
-            id = op1->AsHWIntrinsic()->GetHWIntrinsicId();
-        }
-        return (id == NI_Sve_CreateFalseMaskByte);
-    }
-    else if (IsCnsMsk())
+    if (IsCnsMsk())
     {
         return AsMskCon()->IsZero();
     }

From afb4c3a585160c32ac7e09ca51687ba1e3485d42 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Jun 2025 11:00:49 +0100
Subject: [PATCH 58/62] inline IsTrueMask/IsFalseMask

---
 src/coreclr/jit/gentree.cpp | 38 ------------------------------
 src/coreclr/jit/gentree.h   | 47 +++++++++++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index c5c132e38cbf5a..51fb62419a3653 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32922,44 +32922,6 @@ GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree,
     return mskCon;
 }

-//------------------------------------------------------------------------
-// IsTrueMask: Is the given node a true mask
-//
-// Arguments:
-//    simdBaseType - the base type of the mask
-//
-// Returns true if the node is a true mask for the given simdBaseType.
-//
-// Note that a byte true mask (1111...) is different to an int true mask
-// (10001000...), therefore the simdBaseType of the mask needs to be
-// taken into account.
-//
-bool GenTree::IsTrueMask(var_types simdBaseType) const
-{
-#ifdef TARGET_ARM64
-    // TODO-SVE: For agnostic VL, vector type may not be simd16_t
-
-    if (IsCnsMsk())
-    {
-        return SveMaskPatternAll == EvaluateSimdMaskToPattern<simd16_t>(simdBaseType, AsMskCon()->gtSimdMaskVal);
-    }
-#endif
-
-    return false;
-}
-
-bool GenTree::IsFalseMask() const
-{
-#ifdef TARGET_ARM64
-    if (IsCnsMsk())
-    {
-        return AsMskCon()->IsZero();
-    }
-#endif
-
-    return false;
-}
-
 #endif // FEATURE_HW_INTRINSICS

 //------------------------------------------------------------------------

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index bc92e1b2ae8d79..4569eee63bf787 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1804,8 +1804,8 @@ struct GenTree
     inline bool IsVectorBroadcast(var_types simdBaseType) const;

 #ifdef FEATURE_HW_INTRINSICS
-    bool IsTrueMask(var_types simdBaseType) const;
-    bool IsFalseMask() const;
+    inline bool IsTrueMask(var_types simdBaseType) const;
+    inline bool IsFalseMask() const;
 #endif

     inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType);
@@ -9553,6 +9553,49 @@ inline bool GenTree::IsVectorBroadcast(var_types simdBaseType) const
     return false;
 }

+//------------------------------------------------------------------------
+// IsTrueMask: Is the given node a true mask
+//
+// Arguments:
+//    simdBaseType - the base type of the mask
+//
+// Returns true if the node is a true mask for the given simdBaseType.
+//
+// Note that a byte true mask (1111...) is different to an int true mask
+// (10001000...), therefore the simdBaseType of the mask needs to be
+// taken into account.
+//
+inline bool GenTree::IsTrueMask(var_types simdBaseType) const
+{
+#ifdef TARGET_ARM64
+    // TODO-SVE: For agnostic VL, vector type may not be simd16_t
+
+    if (IsCnsMsk())
+    {
+        return SveMaskPatternAll == EvaluateSimdMaskToPattern<simd16_t>(simdBaseType, AsMskCon()->gtSimdMaskVal);
+    }
+#endif
+
+    return false;
+}
+
+//------------------------------------------------------------------------
+// IsFalseMask: Is the given node a false mask
+//
+// Returns true if the node is a false mask, ie all zeros
+//
+inline bool GenTree::IsFalseMask() const
+{
+#ifdef TARGET_ARM64
+    if (IsCnsMsk())
+    {
+        return AsMskCon()->IsZero();
+    }
+#endif
+
+    return false;
+}
+
 //-------------------------------------------------------------------
 // GetIntegralVectorConstElement: Gets the value of a given element in an integral vector constant
 //

From e8aee07570ece10b79feead9b5d77941e255ae22 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Jun 2025 11:06:38 +0100
Subject: [PATCH 59/62] Use LABELEDDISPTREERANGE

---
 src/coreclr/jit/lowerarmarch.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 2c99e68cec8d21..1d69b329e760bd 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -3947,8 +3947,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                 // When we are merging with zero, we can specialize
                 // and avoid instantiating the vector constant.
                 MakeSrcContained(node, op1);
-                JITDUMP("Containing false mask op1 inside ConditionalSelect\n");
-                DISPTREERANGE(BlockRange(), op1);
+                LABELEDDISPTREERANGE("Contained false mask op1 in ConditionalSelect", BlockRange(), op1);
             }

             // Handle op2
@@ -3993,8 +3992,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     {
                         MakeSrcContained(node, op2);
                         op2->MakeEmbMaskOp();
-                        JITDUMP("Containing op2 inside ConditionalSelect\n");
-                        DISPTREERANGE(BlockRange(), node);
+                        LABELEDDISPTREERANGE("Contained op2 in ConditionalSelect", BlockRange(), node);
                     }
                 }

@@ -4006,8 +4004,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     if (embOp->Op(2)->IsCnsIntOrI())
                     {
                         MakeSrcContained(op2, embOp->Op(2));
-                        JITDUMP("Containing ShiftRight op2 inside ConditionalSelect\n");
-                        DISPTREERANGE(BlockRange(), op2);
+                        LABELEDDISPTREERANGE("Contained ShiftRight in ConditionalSelect", BlockRange(), op2);
                     }
                 }
             }
@@ -4019,8 +4016,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                 // and avoid instantiating the vector constant.
                 // Do this only if op1 was AllTrueMask
                 MakeSrcContained(node, op3);
-                JITDUMP("Containing false mask op3 inside ConditionalSelect\n");
-                DISPTREERANGE(BlockRange(), op3);
+                LABELEDDISPTREERANGE("Contained false mask op3 in ConditionalSelect", BlockRange(), op3);
             }

             break;
@@ -4137,13 +4133,14 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode)
     // op3 is all zeros. Such a Csel operation is absorbed into the instruction when emitted. Skip this
     // optimisation when the nestedOp is a reduce operation.
-    if (nestedOp1->IsTrueMask(cndSelNode->GetSimdBaseType()) && !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) &&
+    if (nestedOp1->IsTrueMask(cndSelNode->GetSimdBaseType()) &&
+        !HWIntrinsicInfo::IsReduceOperation(nestedOp2Id) &&
         (!HWIntrinsicInfo::IsZeroingMaskedOperation(nestedOp2Id) || op3->IsVectorZero()))
     {
         GenTree* nestedOp2 = nestedCndSel->Op(2);
         GenTree* nestedOp3 = nestedCndSel->Op(3);

-        LABELEDDISPTREERANGE("Removed nested conditionalselect (before):", BlockRange(), cndSelNode);
+        LABELEDDISPTREERANGE("Removed nested conditionalselect (before)", BlockRange(), cndSelNode);

         // Transform:
         //

From 85b4da85f50de6f1516c266fa6e459c87c4339a6 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Jun 2025 11:13:42 +0100
Subject: [PATCH 60/62] Add header to gtFoldExprConvertVecCnsToMask

---
 src/coreclr/jit/gentree.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 51fb62419a3653..ae3a96765bd38e 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -32870,6 +32870,17 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
     return resultNode;
 }

+//------------------------------------------------------------------------------
+// gtFoldExprConvertVecCnsToMask: Folds a constant vector plus conversion to
+//    mask into a constant mask.
+//
+// Arguments:
+//    tree   - The convert vector to mask node
+//    vecCon - The vector constant converted by the convert
+//
+// Return Value:
+//    Returns a constant mask
+//
 GenTreeMskCon* Compiler::gtFoldExprConvertVecCnsToMask(GenTreeHWIntrinsic* tree, GenTreeVecCon* vecCon)
 {
     assert(tree->OperIsConvertVectorToMask());

From 1b037e4e2c83ddae6aa0898fabac11fe05f288b6 Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Jun 2025 11:50:08 +0100
Subject: [PATCH 61/62] Remove FEATURE_HW_INTRINSICS around IsTrueMask/IsFalseMask

---
 src/coreclr/jit/gentree.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 4569eee63bf787..c5d49fbacfca3a 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1802,11 +1802,8 @@ struct GenTree
     inline bool IsVectorCreate() const;
     inline bool IsVectorAllBitsSet() const;
     inline bool IsVectorBroadcast(var_types simdBaseType) const;
-
-#ifdef FEATURE_HW_INTRINSICS
     inline bool IsTrueMask(var_types simdBaseType) const;
     inline bool IsFalseMask() const;
-#endif

     inline uint64_t GetIntegralVectorConstElement(size_t index, var_types simdBaseType);

From a1f703cc6d6950c8014ba61f962b38d03b095bef Mon Sep 17 00:00:00 2001
From: Alan Hayward
Date: Fri, 13 Jun 2025 16:00:44 +0100
Subject: [PATCH 62/62] turn off fgMorphTryUseAllMaskVariant

---
 src/coreclr/jit/morph.cpp                      |  5 ++++-
 src/tests/JIT/opt/SVE/PredicateInstructions.cs | 12 ------------
 2 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 9843e1adc58687..9af815bf726c7d 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9727,7 +9727,10 @@ GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* n
         return node;
     }
 #elif defined(TARGET_ARM64)
-    return fgMorphTryUseAllMaskVariant(node);
+    // TODO-SVE: This optimisation is too naive. It needs to calculate the full cost of the instruction
+    //           vs using the predicate version, taking into account all input arguments and all uses
+    //           of the result.
+    // return fgMorphTryUseAllMaskVariant(node);
 #else
 #error Unsupported platform
 #endif

diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
index 41b09c1fad3898..b1336674f1638b 100644
--- a/src/tests/JIT/opt/SVE/PredicateInstructions.cs
+++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
@@ -35,56 +35,48 @@ public static void TestPredicateInstructions()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> ZipLow()
     {
-        //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
         return Sve.ZipLow(Vector<short>.Zero, Sve.CreateTrueMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<uint> ZipHigh()
     {
-        //ARM64-FULL-LINE: zip2 {{p[0-9]+}}.s, {{p[0-9]+}}.s, {{p[0-9]+}}.s
         return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), Sve.CreateTrueMaskUInt32());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<sbyte> UnzipEven()
     {
-        //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.b, {{p[0-9]+}}.b, {{p[0-9]+}}.b
         return Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector<sbyte>.Zero);
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> UnzipOdd()
     {
-        //ARM64-FULL-LINE: uzp2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
         return Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<long> TransposeEven()
     {
-        //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.d, {{p[0-9]+}}.d, {{p[0-9]+}}.d
         return Sve.TransposeEven(Sve.CreateFalseMaskInt64(), Sve.CreateTrueMaskInt64());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> TransposeOdd()
     {
-        //ARM64-FULL-LINE: trn2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
         return Sve.TransposeOdd(Vector<short>.Zero, Sve.CreateTrueMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> ReverseElement()
     {
-        //ARM64-FULL-LINE: rev {{p[0-9]+}}.h, {{p[0-9]+}}.h
         return Sve.ReverseElement(Sve.CreateTrueMaskInt16());
     }

     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> And()
     {
-        //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
         return Sve.ConditionalSelect(
             Sve.CreateTrueMaskInt16(),
             Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
@@ -95,7 +87,6 @@ static Vector<short> And()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> BitwiseClear()
     {
-        //ARM64-FULL-LINE: bic {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
         return Sve.ConditionalSelect(
             Sve.CreateFalseMaskInt16(),
             Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
@@ -106,7 +97,6 @@ static Vector<short> BitwiseClear()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<int> Xor()
     {
-        //ARM64-FULL-LINE: eor {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
         return Sve.ConditionalSelect(
             Sve.CreateTrueMaskInt32(),
             Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()),
@@ -117,7 +107,6 @@ static Vector<int> Xor()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> Or()
     {
-        //ARM64-FULL-LINE: orr {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
         return Sve.ConditionalSelect(
             Sve.CreateTrueMaskInt16(),
             Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
@@ -128,7 +117,6 @@ static Vector<short> Or()
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<int> ConditionalSelect()
     {
-        //ARM64-FULL-LINE: sel {{p[0-9]+}}.b, {{p[0-9]+}}, {{p[0-9]+}}.b, {{p[0-9]+}}.b
         return Sve.ConditionalSelect(
             Vector<int>.Zero,
             Sve.CreateFalseMaskInt32(),
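A closing note on IsTrueMask(simdBaseType), which the later patches converge on: an SVE predicate holds one bit per vector byte, so what counts as "all true" depends on the element size being masked. A short C# sketch of the distinction, using the same Sve API as the tests above; the bit-pattern notation in the comments follows the 1111... / 10001000... convention from PATCH 55 and assumes a 128-bit vector length:

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static (Vector<byte>, Vector<int>) TrueMaskShapes()
{
    // All-true for byte elements: every predicate bit set (1111...).
    Vector<byte> byteMask = Sve.CreateTrueMaskByte();

    // All-true for int elements: one predicate bit per 4-byte lane (10001000...).
    // The same constant mask can therefore be a true mask for one element size
    // and not another, which is why IsTrueMask must be told the simdBaseType
    // it is being judged against.
    Vector<int> intMask = Sve.CreateTrueMaskInt32();

    return (byteMask, intMask);
}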