@@ -1766,6 +1766,64 @@ class LoopVectorizationCostModel {
17661766};
17671767} // end namespace llvm
17681768
1769+ static std::optional<unsigned > getMaxVScale (const Function &F,
1770+ const TargetTransformInfo &TTI) {
1771+ if (std::optional<unsigned > MaxVScale = TTI.getMaxVScale ())
1772+ return MaxVScale;
1773+
1774+ if (F.hasFnAttribute (Attribute::VScaleRange))
1775+ return F.getFnAttribute (Attribute::VScaleRange).getVScaleRangeMax ();
1776+
1777+ return std::nullopt ;
1778+ }
1779+
1780+ // / For the given VF and UF and maximum trip count computed for the loop, return
1781+ // / whether the induction variable might overflow in the vectorized loop. If
1782+ // / not, then we know a runtime overflow check always evaluates to false and can
1783+ // / be removed.
1784+ static bool
1785+ isIndvarOverflowCheckKnownFalse (const LoopVectorizationCostModel *Cost,
1786+ ElementCount VF,
1787+ std::optional<unsigned > UF = std::nullopt ) {
1788+ // Always be conservative if we don't know the exact unroll factor.
1789+ unsigned MaxUF = UF ? *UF : Cost->TTI .getMaxInterleaveFactor (VF);
1790+
1791+ IntegerType *IdxTy = Cost->Legal ->getWidestInductionType ();
1792+ APInt MaxUIntTripCount = IdxTy->getMask ();
1793+
1794+ // We know the runtime overflow check is known false iff the (max) trip-count
1795+ // is known and (max) trip-count + (VF * UF) does not overflow in the type of
1796+ // the vector loop induction variable.
1797+ if (unsigned TC = Cost->PSE .getSmallConstantMaxTripCount ()) {
1798+ uint64_t MaxVF = VF.getKnownMinValue ();
1799+ if (VF.isScalable ()) {
1800+ std::optional<unsigned > MaxVScale =
1801+ getMaxVScale (*Cost->TheFunction , Cost->TTI );
1802+ if (!MaxVScale)
1803+ return false ;
1804+ MaxVF *= *MaxVScale;
1805+ }
1806+
1807+ return (MaxUIntTripCount - TC).ugt (MaxVF * MaxUF);
1808+ }
1809+
1810+ return false ;
1811+ }
1812+
1813+ // / Checks whether an IndVar overflow check is needed using
1814+ // / isIndvarOverflowCheckKnownFalse, with additional information about the
1815+ // / tail-folding style.
1816+ static bool isIndvarOverflowCheckNeeded (const LoopVectorizationCostModel &CM,
1817+ ElementCount VF, unsigned IC) {
1818+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
1819+ // an overflow to zero when updating induction variables and so an
1820+ // additional overflow check is required before entering the vector loop.
1821+ return VF.isScalable () && !CM.TTI .isVScaleKnownToBeAPowerOfTwo () &&
1822+ !isIndvarOverflowCheckKnownFalse (&CM, VF, IC) &&
1823+ CM.getTailFoldingStyle () !=
1824+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
1825+ }
1826+
17691827namespace {
17701828// / Helper struct to manage generating runtime checks for vectorization.
17711829// /
@@ -1790,7 +1848,6 @@ class GeneratedRTChecks {
17901848
17911849 DominatorTree *DT;
17921850 LoopInfo *LI;
1793- TargetTransformInfo *TTI;
17941851
17951852 SCEVExpander SCEVExp;
17961853 SCEVExpander MemCheckExp;
@@ -1801,17 +1858,16 @@ class GeneratedRTChecks {
18011858
18021859 PredicatedScalarEvolution &PSE;
18031860
1804- // / The kind of cost that we are calculating
1805- TTI::TargetCostKind CostKind ;
1861+ // / The CostModel.
1862+ const LoopVectorizationCostModel &CM ;
18061863
18071864public:
18081865 GeneratedRTChecks (PredicatedScalarEvolution &PSE, DominatorTree *DT,
1809- LoopInfo *LI, TargetTransformInfo *TTI,
1810- TTI::TargetCostKind CostKind)
1811- : DT(DT), LI(LI), TTI(TTI),
1866+ LoopInfo *LI, LoopVectorizationCostModel &CM)
1867+ : DT(DT), LI(LI),
18121868 SCEVExp (*PSE.getSE(), "scev.check", /* PreserveLCSSA=*/ false),
18131869 MemCheckExp(*PSE.getSE(), "scev.check", /* PreserveLCSSA=*/ false),
1814- PSE(PSE), CostKind(CostKind ) {}
1870+ PSE(PSE), CM(CM ) {}
18151871
18161872 // / Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
18171873 // / accurately estimate the cost of the runtime checks. The blocks are
@@ -1833,11 +1889,18 @@ class GeneratedRTChecks {
18331889 BasicBlock *LoopHeader = L->getHeader ();
18341890 BasicBlock *Preheader = L->getLoopPreheader ();
18351891
1892+ // SCEVChecks are droppable when the UnionPred is always true, or when
1893+ // IndVar overflow checks are not needed, under the condition that we don't
1894+ // drop stride-versioning checks.
1895+ bool SCEVChecksAreDroppable =
1896+ UnionPred.isAlwaysTrue () || (!isIndvarOverflowCheckNeeded (CM, VF, IC) &&
1897+ LAI.getSymbolicStrides ().empty ());
1898+
18361899 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
18371900 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
18381901 // may be used by SCEVExpander. The blocks will be un-linked from their
18391902 // predecessors and removed from LI & DT at the end of the function.
1840- if (!UnionPred. isAlwaysTrue () ) {
1903+ if (!SCEVChecksAreDroppable ) {
18411904 SCEVCheckBlock = SplitBlock (Preheader, Preheader->getTerminator (), DT, LI,
18421905 nullptr , " vector.scevcheck" );
18431906
@@ -1935,7 +1998,7 @@ class GeneratedRTChecks {
19351998 for (Instruction &I : *SCEVCheckBlock) {
19361999 if (SCEVCheckBlock->getTerminator () == &I)
19372000 continue ;
1938- InstructionCost C = TTI-> getInstructionCost (&I, CostKind);
2001+ InstructionCost C = CM. TTI . getInstructionCost (&I, CM. CostKind );
19392002 LLVM_DEBUG (dbgs () << " " << C << " for " << I << " \n " );
19402003 RTCheckCost += C;
19412004 }
@@ -1944,7 +2007,7 @@ class GeneratedRTChecks {
19442007 for (Instruction &I : *MemCheckBlock) {
19452008 if (MemCheckBlock->getTerminator () == &I)
19462009 continue ;
1947- InstructionCost C = TTI-> getInstructionCost (&I, CostKind);
2010+ InstructionCost C = CM. TTI . getInstructionCost (&I, CM. CostKind );
19482011 LLVM_DEBUG (dbgs () << " " << C << " for " << I << " \n " );
19492012 MemCheckCost += C;
19502013 }
@@ -2222,49 +2285,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
22222285 llvm_unreachable (" invalid enum" );
22232286}
22242287
2225- static std::optional<unsigned > getMaxVScale (const Function &F,
2226- const TargetTransformInfo &TTI) {
2227- if (std::optional<unsigned > MaxVScale = TTI.getMaxVScale ())
2228- return MaxVScale;
2229-
2230- if (F.hasFnAttribute (Attribute::VScaleRange))
2231- return F.getFnAttribute (Attribute::VScaleRange).getVScaleRangeMax ();
2232-
2233- return std::nullopt ;
2234- }
2235-
2236- // / For the given VF and UF and maximum trip count computed for the loop, return
2237- // / whether the induction variable might overflow in the vectorized loop. If not,
2238- // / then we know a runtime overflow check always evaluates to false and can be
2239- // / removed.
2240- static bool isIndvarOverflowCheckKnownFalse (
2241- const LoopVectorizationCostModel *Cost,
2242- ElementCount VF, std::optional<unsigned > UF = std::nullopt ) {
2243- // Always be conservative if we don't know the exact unroll factor.
2244- unsigned MaxUF = UF ? *UF : Cost->TTI .getMaxInterleaveFactor (VF);
2245-
2246- IntegerType *IdxTy = Cost->Legal ->getWidestInductionType ();
2247- APInt MaxUIntTripCount = IdxTy->getMask ();
2248-
2249- // We know the runtime overflow check is known false iff the (max) trip-count
2250- // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2251- // the vector loop induction variable.
2252- if (unsigned TC = Cost->PSE .getSmallConstantMaxTripCount ()) {
2253- uint64_t MaxVF = VF.getKnownMinValue ();
2254- if (VF.isScalable ()) {
2255- std::optional<unsigned > MaxVScale =
2256- getMaxVScale (*Cost->TheFunction , Cost->TTI );
2257- if (!MaxVScale)
2258- return false ;
2259- MaxVF *= *MaxVScale;
2260- }
2261-
2262- return (MaxUIntTripCount - TC).ugt (MaxVF * MaxUF);
2263- }
2264-
2265- return false ;
2266- }
2267-
22682288// Return whether we allow using masked interleave-groups (for dealing with
22692289// strided loads/stores that reside in predicated blocks, or for dealing
22702290// with gaps).
@@ -2354,13 +2374,7 @@ Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
23542374 // check is known to be true, or known to be false.
23552375 CheckMinIters = Builder.CreateICmp (P, Count, Step, " min.iters.check" );
23562376 } // else step known to be < trip count, use CheckMinIters preset to false.
2357- } else if (VF.isScalable () && !TTI->isVScaleKnownToBeAPowerOfTwo () &&
2358- !isIndvarOverflowCheckKnownFalse (Cost, VF, UF) &&
2359- Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2360- // vscale is not necessarily a power-of-2, which means we cannot guarantee
2361- // an overflow to zero when updating induction variables and so an
2362- // additional overflow check is required before entering the vector loop.
2363-
2377+ } else if (isIndvarOverflowCheckNeeded (*Cost, VF, UF)) {
23642378 // Get the maximum unsigned value for the type.
23652379 Value *MaxUIntTripCount =
23662380 ConstantInt::get (CountTy, cast<IntegerType>(CountTy)->getMask ());
@@ -9122,22 +9136,14 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
91229136void LoopVectorizationPlanner::addMinimumIterationCheck (
91239137 VPlan &Plan, ElementCount VF, unsigned UF,
91249138 ElementCount MinProfitableTripCount) const {
9125- // vscale is not necessarily a power-of-2, which means we cannot guarantee
9126- // an overflow to zero when updating induction variables and so an
9127- // additional overflow check is required before entering the vector loop.
9128- bool IsIndvarOverflowCheckNeededForVF =
9129- VF.isScalable () && !TTI.isVScaleKnownToBeAPowerOfTwo () &&
9130- !isIndvarOverflowCheckKnownFalse (&CM, VF, UF) &&
9131- CM.getTailFoldingStyle () !=
9132- TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
91339139 const uint32_t *BranchWeigths =
91349140 hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())
91359141 ? &MinItersBypassWeights[0 ]
91369142 : nullptr ;
91379143 VPlanTransforms::addMinimumIterationCheck (
91389144 Plan, VF, UF, MinProfitableTripCount,
91399145 CM.requiresScalarEpilogue (VF.isVector ()), CM.foldTailByMasking (),
9140- IsIndvarOverflowCheckNeededForVF , OrigLoop, BranchWeigths,
9146+ isIndvarOverflowCheckNeeded (CM, VF, UF) , OrigLoop, BranchWeigths,
91419147 OrigLoop->getLoopPredecessor ()->getTerminator ()->getDebugLoc (),
91429148 *PSE.getSE ());
91439149}
@@ -9249,7 +9255,7 @@ static bool processLoopInVPlanNativePath(
92499255 VPlan &BestPlan = LVP.getPlanFor (VF.Width );
92509256
92519257 {
9252- GeneratedRTChecks Checks (PSE, DT, LI, TTI, CM. CostKind );
9258+ GeneratedRTChecks Checks (PSE, DT, LI, CM );
92539259 InnerLoopVectorizer LB (L, PSE, LI, DT, TTI, AC, VF.Width , /* UF=*/ 1 , &CM,
92549260 Checks, BestPlan);
92559261 LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
@@ -10085,7 +10091,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1008510091 if (ORE->allowExtraAnalysis (LV_NAME))
1008610092 LVP.emitInvalidCostRemarks (ORE);
1008710093
10088- GeneratedRTChecks Checks (PSE, DT, LI, TTI, CM. CostKind );
10094+ GeneratedRTChecks Checks (PSE, DT, LI, CM );
1008910095 if (LVP.hasPlanWithVF (VF.Width )) {
1009010096 // Select the interleave count.
1009110097 IC = LVP.selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
0 commit comments