Skip to content

Commit fc6997e

Browse files
committed
[LV] Avoid SCEVChecks when IV update doesn't overflow
LoopVectorize already decides, in several separate places, whether an overflow check on the IV update is needed. Consolidate that logic into a single routine, and reuse it to conditionally drop SCEVChecks in GeneratedRTChecks.
1 parent 8c21064 commit fc6997e

File tree

55 files changed

+829
-1911
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

55 files changed

+829
-1911
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 76 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -1766,6 +1766,64 @@ class LoopVectorizationCostModel {
17661766
};
17671767
} // end namespace llvm
17681768

1769+
static std::optional<unsigned> getMaxVScale(const Function &F,
1770+
const TargetTransformInfo &TTI) {
1771+
if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
1772+
return MaxVScale;
1773+
1774+
if (F.hasFnAttribute(Attribute::VScaleRange))
1775+
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
1776+
1777+
return std::nullopt;
1778+
}
1779+
1780+
/// For the given VF and UF and maximum trip count computed for the loop, return
1781+
/// whether the induction variable might overflow in the vectorized loop. If
1782+
/// not, then we know a runtime overflow check always evaluates to false and can
1783+
/// be removed.
1784+
static bool
1785+
isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost,
1786+
ElementCount VF,
1787+
std::optional<unsigned> UF = std::nullopt) {
1788+
// Always be conservative if we don't know the exact unroll factor.
1789+
unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
1790+
1791+
IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
1792+
APInt MaxUIntTripCount = IdxTy->getMask();
1793+
1794+
// We know the runtime overflow check is known false iff the (max) trip-count
1795+
// is known and (max) trip-count + (VF * UF) does not overflow in the type of
1796+
// the vector loop induction variable.
1797+
if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
1798+
uint64_t MaxVF = VF.getKnownMinValue();
1799+
if (VF.isScalable()) {
1800+
std::optional<unsigned> MaxVScale =
1801+
getMaxVScale(*Cost->TheFunction, Cost->TTI);
1802+
if (!MaxVScale)
1803+
return false;
1804+
MaxVF *= *MaxVScale;
1805+
}
1806+
1807+
return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
1808+
}
1809+
1810+
return false;
1811+
}
1812+
1813+
/// Checks whether an IndVar overflow check is needed using
1814+
/// isIndvarOverflowCheckKnownFalse, with additional information about the
1815+
/// tail-folding style.
1816+
static bool isIndvarOverflowCheckNeeded(const LoopVectorizationCostModel &CM,
1817+
ElementCount VF, unsigned IC) {
1818+
// vscale is not necessarily a power-of-2, which means we cannot guarantee
1819+
// an overflow to zero when updating induction variables and so an
1820+
// additional overflow check is required before entering the vector loop.
1821+
return VF.isScalable() && !CM.TTI.isVScaleKnownToBeAPowerOfTwo() &&
1822+
!isIndvarOverflowCheckKnownFalse(&CM, VF, IC) &&
1823+
CM.getTailFoldingStyle() !=
1824+
TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
1825+
}
1826+
17691827
namespace {
17701828
/// Helper struct to manage generating runtime checks for vectorization.
17711829
///
@@ -1801,17 +1859,17 @@ class GeneratedRTChecks {
18011859

18021860
PredicatedScalarEvolution &PSE;
18031861

1804-
/// The kind of cost that we are calculating
1805-
TTI::TargetCostKind CostKind;
1862+
/// The CostModel.
1863+
const LoopVectorizationCostModel &CM;
18061864

18071865
public:
18081866
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
18091867
LoopInfo *LI, TargetTransformInfo *TTI,
1810-
const DataLayout &DL, TTI::TargetCostKind CostKind)
1868+
const DataLayout &DL, LoopVectorizationCostModel &CM)
18111869
: DT(DT), LI(LI), TTI(TTI),
18121870
SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
18131871
MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
1814-
PSE(PSE), CostKind(CostKind) {}
1872+
PSE(PSE), CM(CM) {}
18151873

18161874
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
18171875
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1833,11 +1891,18 @@ class GeneratedRTChecks {
18331891
BasicBlock *LoopHeader = L->getHeader();
18341892
BasicBlock *Preheader = L->getLoopPreheader();
18351893

1894+
// SCEVChecks are droppable when the UnionPred is always true, or when
1895+
// IndVar overflow checks are not needed, under the condition that we don't
1896+
// drop stride-versioning checks.
1897+
bool SCEVChecksAreDroppable =
1898+
UnionPred.isAlwaysTrue() || (!isIndvarOverflowCheckNeeded(CM, VF, IC) &&
1899+
LAI.getSymbolicStrides().empty());
1900+
18361901
// Use SplitBlock to create blocks for SCEV & memory runtime checks to
18371902
// ensure the blocks are properly added to LoopInfo & DominatorTree. Those
18381903
// may be used by SCEVExpander. The blocks will be un-linked from their
18391904
// predecessors and removed from LI & DT at the end of the function.
1840-
if (!UnionPred.isAlwaysTrue()) {
1905+
if (!SCEVChecksAreDroppable) {
18411906
SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
18421907
nullptr, "vector.scevcheck");
18431908

@@ -1935,7 +2000,7 @@ class GeneratedRTChecks {
19352000
for (Instruction &I : *SCEVCheckBlock) {
19362001
if (SCEVCheckBlock->getTerminator() == &I)
19372002
continue;
1938-
InstructionCost C = TTI->getInstructionCost(&I, CostKind);
2003+
InstructionCost C = TTI->getInstructionCost(&I, CM.CostKind);
19392004
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
19402005
RTCheckCost += C;
19412006
}
@@ -1944,7 +2009,7 @@ class GeneratedRTChecks {
19442009
for (Instruction &I : *MemCheckBlock) {
19452010
if (MemCheckBlock->getTerminator() == &I)
19462011
continue;
1947-
InstructionCost C = TTI->getInstructionCost(&I, CostKind);
2012+
InstructionCost C = TTI->getInstructionCost(&I, CM.CostKind);
19482013
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
19492014
MemCheckCost += C;
19502015
}
@@ -2222,49 +2287,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
22222287
llvm_unreachable("invalid enum");
22232288
}
22242289

2225-
static std::optional<unsigned> getMaxVScale(const Function &F,
2226-
const TargetTransformInfo &TTI) {
2227-
if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2228-
return MaxVScale;
2229-
2230-
if (F.hasFnAttribute(Attribute::VScaleRange))
2231-
return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2232-
2233-
return std::nullopt;
2234-
}
2235-
2236-
/// For the given VF and UF and maximum trip count computed for the loop, return
2237-
/// whether the induction variable might overflow in the vectorized loop. If not,
2238-
/// then we know a runtime overflow check always evaluates to false and can be
2239-
/// removed.
2240-
static bool isIndvarOverflowCheckKnownFalse(
2241-
const LoopVectorizationCostModel *Cost,
2242-
ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2243-
// Always be conservative if we don't know the exact unroll factor.
2244-
unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2245-
2246-
IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2247-
APInt MaxUIntTripCount = IdxTy->getMask();
2248-
2249-
// We know the runtime overflow check is known false iff the (max) trip-count
2250-
// is known and (max) trip-count + (VF * UF) does not overflow in the type of
2251-
// the vector loop induction variable.
2252-
if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2253-
uint64_t MaxVF = VF.getKnownMinValue();
2254-
if (VF.isScalable()) {
2255-
std::optional<unsigned> MaxVScale =
2256-
getMaxVScale(*Cost->TheFunction, Cost->TTI);
2257-
if (!MaxVScale)
2258-
return false;
2259-
MaxVF *= *MaxVScale;
2260-
}
2261-
2262-
return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2263-
}
2264-
2265-
return false;
2266-
}
2267-
22682290
// Return whether we allow using masked interleave-groups (for dealing with
22692291
// strided loads/stores that reside in predicated blocks, or for dealing
22702292
// with gaps).
@@ -2354,13 +2376,7 @@ Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
23542376
// check is known to be true, or known to be false.
23552377
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
23562378
} // else step known to be < trip count, use CheckMinIters preset to false.
2357-
} else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
2358-
!isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2359-
Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2360-
// vscale is not necessarily a power-of-2, which means we cannot guarantee
2361-
// an overflow to zero when updating induction variables and so an
2362-
// additional overflow check is required before entering the vector loop.
2363-
2379+
} else if (isIndvarOverflowCheckNeeded(*Cost, VF, UF)) {
23642380
// Get the maximum unsigned value for the type.
23652381
Value *MaxUIntTripCount =
23662382
ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
@@ -9122,22 +9138,14 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
91229138
void LoopVectorizationPlanner::addMinimumIterationCheck(
91239139
VPlan &Plan, ElementCount VF, unsigned UF,
91249140
ElementCount MinProfitableTripCount) const {
9125-
// vscale is not necessarily a power-of-2, which means we cannot guarantee
9126-
// an overflow to zero when updating induction variables and so an
9127-
// additional overflow check is required before entering the vector loop.
9128-
bool IsIndvarOverflowCheckNeededForVF =
9129-
VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
9130-
!isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
9131-
CM.getTailFoldingStyle() !=
9132-
TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
91339141
const uint32_t *BranchWeigths =
91349142
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
91359143
? &MinItersBypassWeights[0]
91369144
: nullptr;
91379145
VPlanTransforms::addMinimumIterationCheck(
91389146
Plan, VF, UF, MinProfitableTripCount,
91399147
CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
9140-
IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths,
9148+
isIndvarOverflowCheckNeeded(CM, VF, UF), OrigLoop, BranchWeigths,
91419149
OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
91429150
*PSE.getSE());
91439151
}
@@ -9249,7 +9257,7 @@ static bool processLoopInVPlanNativePath(
92499257
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
92509258

92519259
{
9252-
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9260+
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM);
92539261
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
92549262
Checks, BestPlan);
92559263
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10085,7 +10093,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1008510093
if (ORE->allowExtraAnalysis(LV_NAME))
1008610094
LVP.emitInvalidCostRemarks(ORE);
1008710095

10088-
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
10096+
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM);
1008910097
if (LVP.hasPlanWithVF(VF.Width)) {
1009010098
// Select the interleave count.
1009110099
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);

llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10,8 +10,8 @@
1010

1111

1212
; CHECK-LABEL: test_sge
13-
; CHECK-LABEL: vector.scevcheck
14-
; CHECK-LABEL: vector.body
13+
; CHECK-NOT: vector.scevcheck
14+
; CHECK: vector.body
1515
define void @test_sge(ptr noalias %A,
1616
ptr noalias %B,
1717
ptr noalias %C, i32 %N) {
@@ -48,8 +48,8 @@ for.end:
4848
}
4949

5050
; CHECK-LABEL: test_uge
51-
; CHECK-LABEL: vector.scevcheck
52-
; CHECK-LABEL: vector.body
51+
; CHECK-NOT: vector.scevcheck
52+
; CHECK: vector.body
5353
define void @test_uge(ptr noalias %A,
5454
ptr noalias %B,
5555
ptr noalias %C, i32 %N, i32 %Offset) {
@@ -88,8 +88,8 @@ for.end:
8888
}
8989

9090
; CHECK-LABEL: test_ule
91-
; CHECK-LABEL: vector.scevcheck
92-
; CHECK-LABEL: vector.body
91+
; CHECK-NOT: vector.scevcheck
92+
; CHECK: vector.body
9393
define void @test_ule(ptr noalias %A,
9494
ptr noalias %B,
9595
ptr noalias %C, i32 %N,
@@ -127,8 +127,8 @@ for.end:
127127
}
128128

129129
; CHECK-LABEL: test_sle
130-
; CHECK-LABEL: vector.scevcheck
131-
; CHECK-LABEL: vector.body
130+
; CHECK-NOT: vector.scevcheck
131+
; CHECK: vector.body
132132
define void @test_sle(ptr noalias %A,
133133
ptr noalias %B,
134134
ptr noalias %C, i32 %N,

0 commit comments

Comments
 (0)