Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
3a130c8
unblock long xplat intrinsics on x86
saucecontrol Feb 21, 2025
7f220c2
tidying
saucecontrol Feb 21, 2025
78dc31d
tidying2
saucecontrol Feb 21, 2025
7330c3e
Merge remote-tracking branch 'upstream/main' into createscalar64
saucecontrol Feb 22, 2025
69065ee
remove CreateScalarUnsafe opt for small loads
saucecontrol Feb 22, 2025
86ebdae
skip more redundant casts for CreateScalar of small types
saucecontrol Feb 22, 2025
cdb0910
use temp reg for CreateScalar float SSE fallback
saucecontrol Feb 22, 2025
5d6fb3f
formatting patch
saucecontrol Feb 22, 2025
bb03516
simplify storeind containment of ToScalar
saucecontrol Feb 23, 2025
cba4ab0
don't use temp reg for CreateScalar float SSE fallback
saucecontrol Feb 23, 2025
1f97bd9
Merge remote-tracking branch 'upstream/main' into createscalar64
saucecontrol Feb 23, 2025
71145ab
Merge remote-tracking branch 'upstream/main' into createscalar64
saucecontrol Feb 24, 2025
42a6ab8
skip cast on other memory loads
saucecontrol Feb 24, 2025
1c98e23
use proper containment check
saucecontrol Feb 24, 2025
c80c566
Merge remote-tracking branch 'upstream/main' into createscalar64
saucecontrol Feb 25, 2025
fb2cf30
Merge remote-tracking branch 'upstream/main' into createscalar64
saucecontrol Feb 26, 2025
3a76030
Merge remote-tracking branch 'upstream/main' into createscalar64
saucecontrol Mar 11, 2025
af01862
add more validation, remove CreateSequence restriction
saucecontrol Mar 13, 2025
811e16e
Merge remote-tracking branch 'upstream/main' into createscalar64
saucecontrol Mar 19, 2025
aebbf68
use appropriate helpers for decomposing ToScalar
saucecontrol Mar 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5685,6 +5685,13 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
// These intrinsics are "ins reg/mem, xmm"
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
attr = emitActualTypeSize(baseType);
#if defined(TARGET_X86)
if (varTypeIsLong(baseType))
{
ins = INS_movq;
attr = EA_8BYTE;
}
#endif // TARGET_X86
break;
}

Expand Down
174 changes: 141 additions & 33 deletions src/coreclr/jit/decomposelongs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,19 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block)
// Return Value:
// None.
//
void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range)
void DecomposeLongs::DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range)
{
assert(compiler != nullptr);

DecomposeLongs decomposer(compiler);
DecomposeLongs decomposer(compiler, lowering);
decomposer.m_range = ⦥

decomposer.DecomposeRangeHelper();
}

//------------------------------------------------------------------------
// DecomposeLongs::DecomposeRangeHelper:
// Decompose each node in the current range.
//
// Decomposition is done as an execution-order walk. Decomposition of
// a particular node can create new nodes that need to be further
Expand Down Expand Up @@ -122,44 +122,76 @@ void DecomposeLongs::DecomposeRangeHelper()
GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
{
// Handle the case where we are implicitly using the lower half of a long lclVar.
if ((tree->TypeGet() == TYP_INT) && tree->OperIsLocal())
if (tree->TypeIs(TYP_INT) && tree->OperIsLocal())
{
LclVarDsc* varDsc = m_compiler->lvaGetDesc(tree->AsLclVarCommon());
if (varTypeIsLong(varDsc) && varDsc->lvPromoted)
{
#ifdef DEBUG
if (m_compiler->verbose)
{
printf("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
"half:\n");
m_compiler->gtDispTreeRange(Range(), tree);
}
#endif // DEBUG
JITDUMP("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
"half:\n");
DISPTREERANGE(Range(), tree);

unsigned loVarNum = varDsc->lvFieldLclStart;
tree->AsLclVarCommon()->SetLclNum(loVarNum);
return tree->gtNext;
}
}

if (tree->TypeGet() != TYP_LONG)
if (!tree->TypeIs(TYP_LONG))
{
return tree->gtNext;
}

#ifdef DEBUG
if (m_compiler->verbose)
{
printf("Decomposing TYP_LONG tree. BEFORE:\n");
m_compiler->gtDispTreeRange(Range(), tree);
}
#endif // DEBUG

LIR::Use use;
if (!Range().TryGetUse(tree, &use))
{
LIR::Use::MakeDummyUse(Range(), tree, &use);
}

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
if (!use.IsDummyUse())
{
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
// Here we do a conservative check for specific cases where it is certain the load/store
// can be contained. In those cases, we can skip decomposition.

GenTree* user = use.User();

if (user->OperIsHWIntrinsic())
{
if (tree->OperIs(GT_CNS_LNG) ||
(tree->OperIs(GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem(user, tree)))
{
NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId();
assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) ||
HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) ||
HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId));

return tree->gtNext;
}
}
else if (user->OperIs(GT_STOREIND) && tree->OperIsHWIntrinsic() && m_compiler->opts.OptimizationEnabled())
{
NamedIntrinsic intrinsicId = tree->AsHWIntrinsic()->GetHWIntrinsicId();
if (HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && m_lowering->IsSafeToContainMem(user, tree))
{
return tree->gtNext;
}
}
}

if (tree->OperIs(GT_STOREIND) && tree->AsStoreInd()->Data()->OperIsHWIntrinsic())
{
// We should only get here if we matched the second pattern above.
assert(HWIntrinsicInfo::IsVectorToScalar(tree->AsStoreInd()->Data()->AsHWIntrinsic()->GetHWIntrinsicId()));

return tree->gtNext;
}
#endif // FEATURE_HW_INTRINSICS && TARGET_X86

JITDUMP("Decomposing TYP_LONG tree. BEFORE:\n");
DISPTREERANGE(Range(), tree);

GenTree* nextNode = nullptr;
switch (tree->OperGet())
{
Expand Down Expand Up @@ -270,19 +302,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)

// If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list
// element into two elements: one for each half of the GT_LONG.
if ((use.Def()->OperGet() == GT_LONG) && !use.IsDummyUse() && (use.User()->OperGet() == GT_FIELD_LIST))
if (use.Def()->OperIs(GT_LONG) && !use.IsDummyUse() && use.User()->OperIs(GT_FIELD_LIST))
{
DecomposeFieldList(use.User()->AsFieldList(), use.Def()->AsOp());
}

#ifdef DEBUG
if (m_compiler->verbose)
{
// NOTE: st_lcl_var doesn't dump properly afterwards.
printf("Decomposing TYP_LONG tree. AFTER:\n");
m_compiler->gtDispTreeRange(Range(), use.Def());
}
#endif
// NOTE: st_lcl_var doesn't dump properly afterwards.
JITDUMP("Decomposing TYP_LONG tree. AFTER:\n");
DISPTREERANGE(Range(), use.Def());

// When casting from a decomposed long to a smaller integer we can discard the high part.
if (m_compiler->opts.OptimizationEnabled() && !use.IsDummyUse() && use.User()->OperIs(GT_CAST) &&
Expand Down Expand Up @@ -1707,6 +1734,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
}

case NI_Vector128_ToScalar:
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
{
return DecomposeHWIntrinsicToScalar(use, hwintrinsicTree);
}

case NI_EVEX_MoveMask:
{
return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree);
Expand Down Expand Up @@ -1751,9 +1785,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
{
assert(node == use.Def());
assert(varTypeIsLong(node));
assert((node->GetHWIntrinsicId() == NI_Vector128_GetElement) ||
(node->GetHWIntrinsicId() == NI_Vector256_GetElement) ||
(node->GetHWIntrinsicId() == NI_Vector512_GetElement));
assert(HWIntrinsicInfo::IsVectorGetElement(node->GetHWIntrinsicId()));

GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
Expand Down Expand Up @@ -1835,6 +1867,75 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
}

//------------------------------------------------------------------------
// DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar.
//
// create:
//
// tmp_simd_var = simd_var
// lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var)
// hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1)
//   - or -
// GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32))
// return: GT_LONG(lo_result, hi_result)
//
// Arguments:
//    use  - the LIR::Use object for the def that needs to be decomposed.
//    node - the hwintrinsic node to decompose
//
// Return Value:
//    The GT_LONG node wrapping the upper and lower halves.
//
GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node)
{
    assert(node == use.Def());
    assert(varTypeIsLong(node));
    assert(HWIntrinsicInfo::IsVectorToScalar(node->GetHWIntrinsicId()));

    GenTree*  op1          = node->Op(1);
    var_types simdBaseType = node->GetSimdBaseType();
    unsigned  simdSize     = node->GetSimdSize();

    assert(varTypeIsLong(simdBaseType));
    assert(varTypeIsSIMD(op1));

    // The source vector is consumed twice (lo and hi extraction), so
    // represent it as a local var we can reference more than once.
    GenTree* simdTmpVar    = RepresentOpAsLocalVar(op1, node, &node->Op(1));
    unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum();
    JITDUMP("[DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n");
    DISPTREERANGE(Range(), simdTmpVar);

    // lo_result = ToScalar<int>(tmp_simd_var)
    GenTree* loResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, simdTmpVar, CORINFO_TYPE_INT, simdSize);
    Range().InsertAfter(simdTmpVar, loResult);

    // Second use of the temp for the hi half.
    simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet());
    Range().InsertAfter(loResult, simdTmpVar);

    GenTree* hiResult;
    if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
    {
        // hi_result = GetElement<int>(tmp_simd_var, 1)
        GenTree* one = m_compiler->gtNewIconNode(1);
        hiResult     = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize);

        Range().InsertAfter(simdTmpVar, one, hiResult);
    }
    else
    {
        assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2));

        // SSE2 fallback: hi_result = ToScalar<int>(tmp_simd_var >> 32)
        GenTree* thirtyTwo = m_compiler->gtNewIconNode(32);
        GenTree* shift     = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo,
                                                            node->GetSimdBaseJitType(), simdSize);
        hiResult           = m_compiler->gtNewSimdToScalarNode(TYP_INT, shift, CORINFO_TYPE_INT, simdSize);

        Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult);
    }

    Range().Remove(node);

    return FinalizeDecomposition(use, loResult, hiResult, hiResult);
}

//------------------------------------------------------------------------
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
//
Expand Down Expand Up @@ -2262,6 +2363,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum)
{
return;
}
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
if (varDsc->lvIsParam)
{
// Promotion blocks combined read optimizations for SIMD loads of long params
return;
}
Comment on lines +2367 to +2371
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In isolation, this change produced a small number of diffs and was mostly an improvement. A few regressions show up in the SPMI reports, but the overall impact is good, especially considering the places we can load a long to vector with movq

#endif // FEATURE_HW_INTRINSICS && TARGET_X86

varDsc->lvFieldCnt = 2;
varDsc->lvFieldLclStart = m_compiler->lvaCount;
Expand Down
8 changes: 6 additions & 2 deletions src/coreclr/jit/decomposelongs.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,21 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#define _DECOMPOSELONGS_H_

#include "compiler.h"
#include "lower.h"

class DecomposeLongs
{
public:
DecomposeLongs(Compiler* compiler)
DecomposeLongs(Compiler* compiler, Lowering* lowering)
: m_compiler(compiler)
, m_lowering(lowering)
{
}

void PrepareForDecomposition();
void DecomposeBlock(BasicBlock* block);

static void DecomposeRange(Compiler* compiler, LIR::Range& range);
static void DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range);

private:
inline LIR::Range& Range() const
Expand Down Expand Up @@ -64,6 +66,7 @@ class DecomposeLongs
#ifdef FEATURE_HW_INTRINSICS
GenTree* DecomposeHWIntrinsic(LIR::Use& use);
GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node);
GenTree* DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node);
GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node);
#endif // FEATURE_HW_INTRINSICS

Expand All @@ -80,6 +83,7 @@ class DecomposeLongs

// Data
Compiler* m_compiler;
Lowering* m_lowering;
LIR::Range* m_range;
};

Expand Down
21 changes: 15 additions & 6 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20772,22 +20772,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;

case INS_movd:
case INS_movq: // only MOVQ xmm, xmm is different (emitted by Sse2.MoveScalar, should use MOVDQU instead)
case INS_movq:
if (memAccessKind == PERFSCORE_MEMORY_NONE)
{
// movd r32, xmm or xmm, r32
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency = PERFSCORE_LATENCY_3C;
if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2()))
{
// movq xmm, xmm
result.insThroughput = PERFSCORE_THROUGHPUT_3X;
result.insLatency = PERFSCORE_LATENCY_1C;
}
else
{
// movd r32/64, xmm or xmm, r32/64
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency = PERFSCORE_LATENCY_3C;
}
}
else if (memAccessKind == PERFSCORE_MEMORY_READ)
{
// movd xmm, m32
// ins xmm, m32/64
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_2C;
}
else
{
// movd m32, xmm
// ins m32/64, xmm
assert(memAccessKind == PERFSCORE_MEMORY_WRITE);
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_2C;
Expand Down
Loading
Loading