@@ -1766,6 +1766,64 @@ class LoopVectorizationCostModel {
17661766};
17671767} // end namespace llvm
17681768
1769+ static std::optional<unsigned > getMaxVScale (const Function &F,
1770+ const TargetTransformInfo &TTI) {
1771+ if (std::optional<unsigned > MaxVScale = TTI.getMaxVScale ())
1772+ return MaxVScale;
1773+
1774+ if (F.hasFnAttribute (Attribute::VScaleRange))
1775+ return F.getFnAttribute (Attribute::VScaleRange).getVScaleRangeMax ();
1776+
1777+ return std::nullopt ;
1778+ }
1779+
1780+ // / For the given VF and UF and maximum trip count computed for the loop, return
1781+ // / whether the induction variable might overflow in the vectorized loop. If
1782+ // / not, then we know a runtime overflow check always evaluates to false and can
1783+ // / be removed.
1784+ static bool
1785+ isIndvarOverflowCheckKnownFalse (const LoopVectorizationCostModel *Cost,
1786+ ElementCount VF,
1787+ std::optional<unsigned > UF = std::nullopt ) {
1788+ // Always be conservative if we don't know the exact unroll factor.
1789+ unsigned MaxUF = UF ? *UF : Cost->TTI .getMaxInterleaveFactor (VF);
1790+
1791+ IntegerType *IdxTy = Cost->Legal ->getWidestInductionType ();
1792+ APInt MaxUIntTripCount = IdxTy->getMask ();
1793+
1794+ // We know the runtime overflow check is known false iff the (max) trip-count
1795+ // is known and (max) trip-count + (VF * UF) does not overflow in the type of
1796+ // the vector loop induction variable.
1797+ if (unsigned TC = Cost->PSE .getSmallConstantMaxTripCount ()) {
1798+ uint64_t MaxVF = VF.getKnownMinValue ();
1799+ if (VF.isScalable ()) {
1800+ std::optional<unsigned > MaxVScale =
1801+ getMaxVScale (*Cost->TheFunction , Cost->TTI );
1802+ if (!MaxVScale)
1803+ return false ;
1804+ MaxVF *= *MaxVScale;
1805+ }
1806+
1807+ return (MaxUIntTripCount - TC).ugt (MaxVF * MaxUF);
1808+ }
1809+
1810+ return false ;
1811+ }
1812+
1813+ // / Checks whether an IndVar overflow check is needed using
1814+ // / isIndvarOverflowCheckKnownFalse, with additional information about the
1815+ // / tail-folding style.
1816+ static bool isIndvarOverflowCheckNeeded (const LoopVectorizationCostModel &CM,
1817+ ElementCount VF, unsigned IC) {
1818+ // vscale is not necessarily a power-of-2, which means we cannot guarantee
1819+ // an overflow to zero when updating induction variables and so an
1820+ // additional overflow check is required before entering the vector loop.
1821+ return VF.isScalable () && !CM.TTI .isVScaleKnownToBeAPowerOfTwo () &&
1822+ !isIndvarOverflowCheckKnownFalse (&CM, VF, IC) &&
1823+ CM.getTailFoldingStyle () !=
1824+ TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
1825+ }
1826+
17691827namespace {
17701828// / Helper struct to manage generating runtime checks for vectorization.
17711829// /
@@ -1801,17 +1859,17 @@ class GeneratedRTChecks {
18011859
18021860 PredicatedScalarEvolution &PSE;
18031861
1804- // / The kind of cost that we are calculating
1805- TTI::TargetCostKind CostKind ;
1862+ // / The CostModel.
1863+ const LoopVectorizationCostModel &CM ;
18061864
18071865public:
18081866 GeneratedRTChecks (PredicatedScalarEvolution &PSE, DominatorTree *DT,
18091867 LoopInfo *LI, TargetTransformInfo *TTI,
1810- const DataLayout &DL, TTI::TargetCostKind CostKind )
1868+ const DataLayout &DL, LoopVectorizationCostModel &CM )
18111869 : DT(DT), LI(LI), TTI(TTI),
18121870 SCEVExp (*PSE.getSE(), DL, "scev.check", /* PreserveLCSSA=*/ false),
18131871 MemCheckExp(*PSE.getSE(), DL, "scev.check", /* PreserveLCSSA=*/ false),
1814- PSE(PSE), CostKind(CostKind ) {}
1872+ PSE(PSE), CM(CM ) {}
18151873
18161874 // / Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
18171875 // / accurately estimate the cost of the runtime checks. The blocks are
@@ -1833,11 +1891,18 @@ class GeneratedRTChecks {
18331891 BasicBlock *LoopHeader = L->getHeader ();
18341892 BasicBlock *Preheader = L->getLoopPreheader ();
18351893
1894+ // SCEVChecks are droppable when the UnionPred is always true, or when
1895+ // IndVar overflow checks are not needed, under the condition that we don't
1896+ // drop stride-versioning checks.
1897+ bool SCEVChecksAreDroppable =
1898+ UnionPred.isAlwaysTrue () || (!isIndvarOverflowCheckNeeded (CM, VF, IC) &&
1899+ LAI.getSymbolicStrides ().empty ());
1900+
18361901 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
18371902 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
18381903 // may be used by SCEVExpander. The blocks will be un-linked from their
18391904 // predecessors and removed from LI & DT at the end of the function.
1840- if (!UnionPred. isAlwaysTrue () ) {
1905+ if (!SCEVChecksAreDroppable ) {
18411906 SCEVCheckBlock = SplitBlock (Preheader, Preheader->getTerminator (), DT, LI,
18421907 nullptr , " vector.scevcheck" );
18431908
@@ -1935,7 +2000,7 @@ class GeneratedRTChecks {
19352000 for (Instruction &I : *SCEVCheckBlock) {
19362001 if (SCEVCheckBlock->getTerminator () == &I)
19372002 continue ;
1938- InstructionCost C = TTI->getInstructionCost (&I, CostKind);
2003+ InstructionCost C = TTI->getInstructionCost (&I, CM. CostKind );
19392004 LLVM_DEBUG (dbgs () << " " << C << " for " << I << " \n " );
19402005 RTCheckCost += C;
19412006 }
@@ -1944,7 +2009,7 @@ class GeneratedRTChecks {
19442009 for (Instruction &I : *MemCheckBlock) {
19452010 if (MemCheckBlock->getTerminator () == &I)
19462011 continue ;
1947- InstructionCost C = TTI->getInstructionCost (&I, CostKind);
2012+ InstructionCost C = TTI->getInstructionCost (&I, CM. CostKind );
19482013 LLVM_DEBUG (dbgs () << " " << C << " for " << I << " \n " );
19492014 MemCheckCost += C;
19502015 }
@@ -2222,49 +2287,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
22222287 llvm_unreachable (" invalid enum" );
22232288}
22242289
2225- static std::optional<unsigned > getMaxVScale (const Function &F,
2226- const TargetTransformInfo &TTI) {
2227- if (std::optional<unsigned > MaxVScale = TTI.getMaxVScale ())
2228- return MaxVScale;
2229-
2230- if (F.hasFnAttribute (Attribute::VScaleRange))
2231- return F.getFnAttribute (Attribute::VScaleRange).getVScaleRangeMax ();
2232-
2233- return std::nullopt ;
2234- }
2235-
2236- // / For the given VF and UF and maximum trip count computed for the loop, return
2237- // / whether the induction variable might overflow in the vectorized loop. If not,
2238- // / then we know a runtime overflow check always evaluates to false and can be
2239- // / removed.
2240- static bool isIndvarOverflowCheckKnownFalse (
2241- const LoopVectorizationCostModel *Cost,
2242- ElementCount VF, std::optional<unsigned > UF = std::nullopt ) {
2243- // Always be conservative if we don't know the exact unroll factor.
2244- unsigned MaxUF = UF ? *UF : Cost->TTI .getMaxInterleaveFactor (VF);
2245-
2246- IntegerType *IdxTy = Cost->Legal ->getWidestInductionType ();
2247- APInt MaxUIntTripCount = IdxTy->getMask ();
2248-
2249- // We know the runtime overflow check is known false iff the (max) trip-count
2250- // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2251- // the vector loop induction variable.
2252- if (unsigned TC = Cost->PSE .getSmallConstantMaxTripCount ()) {
2253- uint64_t MaxVF = VF.getKnownMinValue ();
2254- if (VF.isScalable ()) {
2255- std::optional<unsigned > MaxVScale =
2256- getMaxVScale (*Cost->TheFunction , Cost->TTI );
2257- if (!MaxVScale)
2258- return false ;
2259- MaxVF *= *MaxVScale;
2260- }
2261-
2262- return (MaxUIntTripCount - TC).ugt (MaxVF * MaxUF);
2263- }
2264-
2265- return false ;
2266- }
2267-
22682290// Return whether we allow using masked interleave-groups (for dealing with
22692291// strided loads/stores that reside in predicated blocks, or for dealing
22702292// with gaps).
@@ -2354,13 +2376,7 @@ Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
23542376 // check is known to be true, or known to be false.
23552377 CheckMinIters = Builder.CreateICmp (P, Count, Step, " min.iters.check" );
23562378 } // else step known to be < trip count, use CheckMinIters preset to false.
2357- } else if (VF.isScalable () && !TTI->isVScaleKnownToBeAPowerOfTwo () &&
2358- !isIndvarOverflowCheckKnownFalse (Cost, VF, UF) &&
2359- Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2360- // vscale is not necessarily a power-of-2, which means we cannot guarantee
2361- // an overflow to zero when updating induction variables and so an
2362- // additional overflow check is required before entering the vector loop.
2363-
2379+ } else if (isIndvarOverflowCheckNeeded (*Cost, VF, UF)) {
23642380 // Get the maximum unsigned value for the type.
23652381 Value *MaxUIntTripCount =
23662382 ConstantInt::get (CountTy, cast<IntegerType>(CountTy)->getMask ());
@@ -9122,22 +9138,14 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
91229138void LoopVectorizationPlanner::addMinimumIterationCheck (
91239139 VPlan &Plan, ElementCount VF, unsigned UF,
91249140 ElementCount MinProfitableTripCount) const {
9125- // vscale is not necessarily a power-of-2, which means we cannot guarantee
9126- // an overflow to zero when updating induction variables and so an
9127- // additional overflow check is required before entering the vector loop.
9128- bool IsIndvarOverflowCheckNeededForVF =
9129- VF.isScalable () && !TTI.isVScaleKnownToBeAPowerOfTwo () &&
9130- !isIndvarOverflowCheckKnownFalse (&CM, VF, UF) &&
9131- CM.getTailFoldingStyle () !=
9132- TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
91339141 const uint32_t *BranchWeigths =
91349142 hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())
91359143 ? &MinItersBypassWeights[0 ]
91369144 : nullptr ;
91379145 VPlanTransforms::addMinimumIterationCheck (
91389146 Plan, VF, UF, MinProfitableTripCount,
91399147 CM.requiresScalarEpilogue (VF.isVector ()), CM.foldTailByMasking (),
9140- IsIndvarOverflowCheckNeededForVF , OrigLoop, BranchWeigths,
9148+ isIndvarOverflowCheckNeeded (CM, VF, UF) , OrigLoop, BranchWeigths,
91419149 OrigLoop->getLoopPredecessor ()->getTerminator ()->getDebugLoc (),
91429150 *PSE.getSE ());
91439151}
@@ -9249,7 +9257,7 @@ static bool processLoopInVPlanNativePath(
92499257 VPlan &BestPlan = LVP.getPlanFor (VF.Width );
92509258
92519259 {
9252- GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM. CostKind );
9260+ GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM);
92539261 InnerLoopVectorizer LB (L, PSE, LI, DT, TTI, AC, VF.Width , /* UF=*/ 1 , &CM,
92549262 Checks, BestPlan);
92559263 LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
@@ -10085,7 +10093,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1008510093 if (ORE->allowExtraAnalysis (LV_NAME))
1008610094 LVP.emitInvalidCostRemarks (ORE);
1008710095
10088- GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM. CostKind );
10096+ GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM);
1008910097 if (LVP.hasPlanWithVF (VF.Width )) {
1009010098 // Select the interleave count.
1009110099 IC = LVP.selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
0 commit comments