13 changes: 7 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3903,7 +3903,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
continue;

VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
*CM.PSE.getSE(), OrigLoop);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4161,7 +4161,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
*CM.PSE.getSE());
*CM.PSE.getSE(), OrigLoop);
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6836,7 +6836,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE(),
OrigLoop);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
@@ -7070,7 +7071,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
*CM.PSE.getSE());
*CM.PSE.getSE(), OrigLoop);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -8601,7 +8602,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
*CM.PSE.getSE(), OrigLoop);
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -10058,7 +10059,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
CM.CostKind, *CM.PSE.getSE());
CM.CostKind, *CM.PSE.getSE(), L);
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
5 changes: 3 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -350,13 +350,14 @@ struct VPCostContext {
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
ScalarEvolution &SE;
const Loop *L;

VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
TargetTransformInfo::TargetCostKind CostKind,
ScalarEvolution &SE)
ScalarEvolution &SE, const Loop *L)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
CostKind(CostKind), SE(SE) {}
CostKind(CostKind), SE(SE), L(L) {}

/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
19 changes: 9 additions & 10 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3071,23 +3071,24 @@ bool VPReplicateRecipe::shouldPack() const {

/// Returns the SCEV expression for \p Ptr if it is a pointer computation for
/// which the legacy cost model computes a SCEV expression when computing the
/// address cost, and nullptr otherwise.
static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
Contributor: I'm confused about why this function is necessary at all? Can the functionality not be absorbed into vputils::getSCEVExprForVPValue?
const Loop *L) {
auto *PtrR = Ptr->getDefiningRecipe();
if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
Instruction::GetElementPtr) ||
isa<VPWidenGEPRecipe>(PtrR)))
return false;
return nullptr;

// We are looking for a GEP where all indices are either loop invariant or
// inductions.
for (VPValue *Opd : drop_begin(PtrR->operands())) {
if (!Opd->isDefinedOutsideLoopRegions() &&
!isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
return false;
return nullptr;
Comment on lines -3080 to +3088 (Contributor): Hm, I'm confused about why we're returning nullptr at times, and SCEVCouldNotCompute at other times?
}

return true;
return vputils::getSCEVExprForVPValue(Ptr, SE, L);
}

/// Returns true if \p V is used as part of the address of another load or
@@ -3242,11 +3243,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,

bool IsLoad = UI->getOpcode() == Instruction::Load;
const VPValue *PtrOp = getOperand(!IsLoad);
// TODO: Handle cases where we need to pass a SCEV to
// getAddressComputationCost.
if (shouldUseAddressAccessSCEV(PtrOp))
break;

Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
const Align Alignment = getLoadStoreAlignment(UI);
@@ -3257,9 +3253,12 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,

Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);

const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
if (PtrSCEV && isa<SCEVCouldNotCompute>(PtrSCEV))
Contributor: Suggested change:
-if (PtrSCEV && isa<SCEVCouldNotCompute>(PtrSCEV))
+if (isa<SCEVCouldNotCompute>(PtrSCEV))
after forbidding null returns.
break;
InstructionCost ScalarCost =
ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
PtrTy, &Ctx.SE, PtrSCEV, Ctx.CostKind);
if (isSingleScalar())
return ScalarCost;

84 changes: 83 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -75,7 +75,8 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount();
}

const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
ScalarEvolution &SE, const Loop *L) {
if (V->isLiveIn()) {
if (Value *LiveIn = V->getLiveInIRValue())
return SE.getSCEV(LiveIn);
@@ -86,6 +87,87 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
return TypeSwitch<const VPRecipeBase *, const SCEV *>(V->getDefiningRecipe())
.Case<VPExpandSCEVRecipe>(
[](const VPExpandSCEVRecipe *R) { return R->getSCEV(); })
.Case<VPCanonicalIVPHIRecipe>([&SE, L](const VPCanonicalIVPHIRecipe *R) {
if (!L)
return SE.getCouldNotCompute();
const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
return SE.getAddRecExpr(Start, SE.getOne(Start->getType()), L,
SCEV::FlagAnyWrap);
})
.Case<VPDerivedIVRecipe>([&SE, L](const VPDerivedIVRecipe *R) {
const SCEV *Start = getSCEVExprForVPValue(R->getOperand(0), SE, L);
const SCEV *IV = getSCEVExprForVPValue(R->getOperand(1), SE, L);
const SCEV *Scale = getSCEVExprForVPValue(R->getOperand(2), SE, L);
if (isa<SCEVCouldNotCompute>(Start) || isa<SCEVCouldNotCompute>(IV) ||
isa<SCEVCouldNotCompute>(Scale))
return SE.getCouldNotCompute();
Comment on lines +98 to +103 (Contributor): Could simplify this with a quick stl-algorithm call?
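A minimal sketch of the kind of simplification the reviewer may have in mind (assuming llvm::any_of from llvm/ADT/STLExtras.h; not part of the patch):

  // Bail out once if any operand has no computable SCEV.
  const SCEV *Ops[] = {Start, IV, Scale};
  if (any_of(Ops, [](const SCEV *S) { return isa<SCEVCouldNotCompute>(S); }))
    return SE.getCouldNotCompute();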


return SE.getAddExpr(SE.getTruncateOrSignExtend(Start, IV->getType()),
SE.getMulExpr(IV, SE.getTruncateOrSignExtend(
Scale, IV->getType())));
Comment on lines +105 to +107 (Contributor): Quick note: Start and Scale are possibly-negative values, which necessitates a sext, right?
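For context, the lines above use SE.getTruncateOrSignExtend for both Start and Scale. A small worked example of why sign extension matters for a negative scale (illustrative arithmetic, not from the patch):

  sext i32 -3 to i64  ->  -3
  zext i32 -3 to i64  ->  4294967293

Zero-extending a small negative scale would turn it into a huge positive value and corrupt the derived induction computation.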

})
.Case<VPScalarIVStepsRecipe>([&SE, L](const VPScalarIVStepsRecipe *R) {
return getSCEVExprForVPValue(R->getOperand(0), SE, L);
})
.Case<VPReplicateRecipe>([&SE, L](const VPReplicateRecipe *R) {
if (R->getOpcode() != Instruction::GetElementPtr)
return SE.getCouldNotCompute();

const SCEV *Base = getSCEVExprForVPValue(R->getOperand(0), SE, L);
if (isa<SCEVCouldNotCompute>(Base))
return SE.getCouldNotCompute();

Type *IntIdxTy = SE.getEffectiveSCEVType(Base->getType());
Type *CurTy = IntIdxTy;
bool FirstIter = true;
SmallVector<const SCEV *, 4> Offsets;
for (VPValue *Index : drop_begin(R->operands())) {
const SCEV *IndexExpr = getSCEVExprForVPValue(Index, SE, L);
if (isa<SCEVCouldNotCompute>(IndexExpr))
return SE.getCouldNotCompute();
// Compute the (potentially symbolic) offset in bytes for this index.
if (StructType *STy = dyn_cast<StructType>(CurTy)) {
// For a struct, add the member offset.
ConstantInt *Index = cast<SCEVConstant>(IndexExpr)->getValue();
unsigned FieldNo = Index->getZExtValue();
const SCEV *FieldOffset =
SE.getOffsetOfExpr(IntIdxTy, STy, FieldNo);
Offsets.push_back(FieldOffset);

// Update CurTy to the type of the field at Index.
CurTy = STy->getTypeAtIndex(Index);
} else {
// Update CurTy to its element type.
if (FirstIter) {
CurTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
->getSourceElementType();
FirstIter = false;
} else {
CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0);
}
// For an array, add the element offset, explicitly scaled.
const SCEV *ElementSize = SE.getSizeOfExpr(IntIdxTy, CurTy);
// Getelementptr indices are signed.
IndexExpr = SE.getTruncateOrSignExtend(IndexExpr, IntIdxTy);

// Multiply the index by the element size to compute the element
// offset.
const SCEV *LocalOffset = SE.getMulExpr(IndexExpr, ElementSize);
Offsets.push_back(LocalOffset);
}
}
// Handle degenerate case of GEP without offsets.
if (Offsets.empty())
return Base;

// Add the offsets together, assuming nsw if inbounds.
const SCEV *Offset = SE.getAddExpr(Offsets);
// Add the base address and the offset. We cannot use the nsw flag, as
// the base address is unsigned. However, if we know that the offset is
// non-negative, we can use nuw.
return SE.getAddExpr(Base, Offset);
})

Contributor: Extra newline?
.Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
}
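As a worked example of the new GEP handling in the VPReplicateRecipe case (assumed types, illustrative only):

  %T = type { double, double, double }
  %gep = getelementptr %T, ptr %base, i64 %i, i32 1

The first index is scaled by the element size sizeof(%T) = 24 bytes, and the struct index adds the field offset 8, so Offsets = { (24 * %i), 8 } and the function returns a SCEV of the form (8 + (24 * %i) + %base), matching what ScalarEvolution would compute for the equivalent IR GEP.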

3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -37,7 +37,8 @@ VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr);

/// Return the SCEV expression for \p V. Returns SCEVCouldNotCompute if no
/// SCEV expression could be constructed.
const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
const SCEV *getSCEVExprForVPValue(const VPValue *V, ScalarEvolution &SE,
const Loop *L = nullptr);

/// Returns true if \p VPV is a single scalar, either because it produces the
/// same value for all lanes or only has its first lane used.
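A minimal usage sketch for the updated declaration (hypothetical call sites; V, SE, and OrigLoop assumed in scope):

  const SCEV *A = vputils::getSCEVExprForVPValue(V, SE);           // L defaults to nullptr
  const SCEV *B = vputils::getSCEVExprForVPValue(V, SE, OrigLoop); // loop-aware

With a null loop, loop-dependent recipes such as VPCanonicalIVPHIRecipe conservatively return SCEVCouldNotCompute, so pre-existing callers keep their previous behavior.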
@@ -463,21 +463,21 @@ define void @test_prefer_vector_addressing(ptr %start, ptr %ms, ptr noalias %src
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP11]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP12]]
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA14:![0-9]+]]
; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA14]]
; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA14]]
; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA14]]
; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA12:![0-9]+]]
; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA12]]
; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA12]]
; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA12]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA19:![0-9]+]]
; CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA19]]
; CHECK-NEXT: store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA19]]
; CHECK-NEXT: store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA19]]
; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA17:![0-9]+]]
; CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA17]]
; CHECK-NEXT: store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA17]]
; CHECK-NEXT: store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA17]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
Comment on lines -466 to +480 (Contributor): Could revert to minimize changes?
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -543,45 +543,36 @@
define double @test_scalarization_cost_for_load_of_address(ptr %src.0, ptr %src.1, ptr %src.2) {
; CHECK-LABEL: define double @test_scalarization_cost_for_load_of_address(
; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], ptr [[SRC_2:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ 3.000000e+00, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi double [ 3.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr [[T:%.*]], ptr [[SRC_0]], i64 [[IV]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <6 x double>, ptr [[GEP_0]], align 8
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <6 x double> [[WIDE_VEC]], <6 x double> poison, <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <6 x double> [[WIDE_VEC]], <6 x double> poison, <2 x i32> <i32 1, i32 4>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <6 x double> [[WIDE_VEC]], <6 x double> poison, <2 x i32> <i32 2, i32 5>
; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[STRIDED_VEC]], splat (double 3.000000e+00)
; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[STRIDED_VEC1]], splat (double 3.000000e+00)
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[STRIDED_VEC2]], splat (double 3.000000e+00)
; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[L_0:%.*]] = load double, ptr [[GEP_0]], align 8
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[GEP_0]], i64 8
; CHECK-NEXT: [[L_1:%.*]] = load double, ptr [[GEP_8]], align 8
; CHECK-NEXT: [[GEP_16:%.*]] = getelementptr i8, ptr [[GEP_0]], i64 16
; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[GEP_16]], align 8
; CHECK-NEXT: [[MUL_0:%.*]] = fmul double [[L_0]], 3.000000e+00
; CHECK-NEXT: [[MUL_1:%.*]] = fmul double [[L_1]], 3.000000e+00
; CHECK-NEXT: [[MUL_2:%.*]] = fmul double [[L_2]], 3.000000e+00
; CHECK-NEXT: [[ADD_0:%.*]] = fadd double [[MUL_0]], [[MUL_1]]
; CHECK-NEXT: [[ADD_1:%.*]] = fadd double [[ADD_0]], [[MUL_2]]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[IV]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[GEP_SRC]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], [[WIDE_LOAD]]
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr [[T_2:%.*]], ptr [[SRC_2]], i64 [[IV]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[T_2]], ptr [[SRC_2]], i64 [[TMP1]]
; CHECK-NEXT: [[GEP_72:%.*]] = getelementptr i8, ptr [[GEP_SRC_2]], i64 72
; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_SRC]], align 8
; CHECK-NEXT: [[MUL256_US:%.*]] = fmul double [[ADD_1]], [[L]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[T_2:%.*]], ptr [[SRC_2]], i64 [[IV]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP11]], i64 72
; CHECK-NEXT: [[L_P_2:%.*]] = load ptr, ptr [[GEP_72]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8
; CHECK-NEXT: [[LV:%.*]] = load double, ptr [[L_P_2]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[LV]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[TMP17]], i32 1
; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP9]], [[TMP19]]
; CHECK-NEXT: [[TMP21]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> [[TMP20]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH:.*:]]
; CHECK-NEXT: [[RED_NEXT]] = tail call double @llvm.fmuladd.f64(double [[MUL256_US]], double [[TMP17]], double [[RED]])
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
; CHECK-NEXT: br i1 [[EC]], label %[[SCALAR_PH:.*]], label %[[LOOP]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret double [[RED_NEXT_LCSSA]]
;
entry:
br label %loop