llvm
diff --git a/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp‎
Lines changed: 76 additions & 68 deletions b/‎llvm/lib/Transforms/Vectorize/LoopVectorize.cpp‎
Lines changed: 76 additions & 68 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll‎
Lines changed: 8 additions & 8 deletions b/‎llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll‎
Lines changed: 8 additions & 8 deletions
@@ -1766,6 +1766,64 @@ class LoopVectorizationCostModel {
 };
 } // end namespace llvm
 
+static std::optional<unsigned> getMaxVScale(const Function &F,
+                                            const TargetTransformInfo &TTI) {
+  if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
+    return MaxVScale; // A bound reported by TTI takes precedence.
+
+  if (F.hasFnAttribute(Attribute::VScaleRange))
+    return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
+
+  return std::nullopt; // No TTI bound and no VScaleRange attribute on F.
+}
+
+/// For the given VF and UF and maximum trip count computed for the loop, return
+/// true if the induction variable is known not to overflow in the vectorized
+/// loop. In that case a runtime overflow check always evaluates to false and
+/// can be removed. Returns false whenever this cannot be established.
+static bool
+isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost,
+                                ElementCount VF,
+                                std::optional<unsigned> UF = std::nullopt) {
+  // Without an exact unroll factor, conservatively use TTI's maximum.
+  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
+
+  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
+  APInt MaxUIntTripCount = IdxTy->getMask();
+
+  // The runtime overflow check is known false iff a constant (max) trip-count
+  // is available and (max) trip-count + (VF * UF) does not overflow in IdxTy,
+  // the widest induction type. For scalable VFs the maximum vscale is assumed.
+  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
+    uint64_t MaxVF = VF.getKnownMinValue();
+    if (VF.isScalable()) {
+      std::optional<unsigned> MaxVScale =
+          getMaxVScale(*Cost->TheFunction, Cost->TTI);
+      if (!MaxVScale)
+        return false; // No known bound on vscale, so MaxVF is unknown.
+      MaxVF *= *MaxVScale;
+    }
+
+    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
+  }
+
+  return false; // getSmallConstantMaxTripCount() returned 0.
+}
+
+/// Returns true if a runtime IndVar overflow check is needed for \p VF and
+/// \p IC: VF is scalable, vscale is not known to be a power of two, the check
+/// is not provably false, and tail folding does not opt out of runtime checks.
+static bool isIndvarOverflowCheckNeeded(const LoopVectorizationCostModel &CM,
+                                        ElementCount VF, unsigned IC) {
+  // vscale is not necessarily a power-of-2, which means we cannot guarantee
+  // an overflow to zero when updating induction variables and so an
+  // additional overflow check is required before entering the vector loop.
+  return VF.isScalable() && !CM.TTI.isVScaleKnownToBeAPowerOfTwo() &&
+         !isIndvarOverflowCheckKnownFalse(&CM, VF, IC) &&
+         CM.getTailFoldingStyle() !=
+             TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+}
+
 namespace {
 /// Helper struct to manage generating runtime checks for vectorization.
 ///
@@ -1801,17 +1859,17 @@ class GeneratedRTChecks {
 
   PredicatedScalarEvolution &PSE;
 
-  /// The kind of cost that we are calculating
-  TTI::TargetCostKind CostKind;
+  /// The CostModel.
+  const LoopVectorizationCostModel &CM;
 
 public:
   GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                     LoopInfo *LI, TargetTransformInfo *TTI,
-                    const DataLayout &DL, TTI::TargetCostKind CostKind)
+                    const DataLayout &DL, LoopVectorizationCostModel &CM)
       : DT(DT), LI(LI), TTI(TTI),
         SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
         MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false),
-        PSE(PSE), CostKind(CostKind) {}
+        PSE(PSE), CM(CM) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -1833,11 +1891,18 @@ class GeneratedRTChecks {
     BasicBlock *LoopHeader = L->getHeader();
     BasicBlock *Preheader = L->getLoopPreheader();
 
+    // SCEV checks are dropped when UnionPred is always true, or when no IndVar
+    // overflow check is needed and there are no symbolic strides to version.
+    // NOTE(review): confirm UnionPred holds no other predicates in that case.
+    bool SCEVChecksAreDroppable =
+        UnionPred.isAlwaysTrue() || (!isIndvarOverflowCheckNeeded(CM, VF, IC) &&
+                                     LAI.getSymbolicStrides().empty());
+
     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
     // may be used by SCEVExpander. The blocks will be un-linked from their
     // predecessors and removed from LI & DT at the end of the function.
-    if (!UnionPred.isAlwaysTrue()) {
+    if (!SCEVChecksAreDroppable) {
       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                   nullptr, "vector.scevcheck");
 
@@ -1935,7 +2000,7 @@ class GeneratedRTChecks {
       for (Instruction &I : *SCEVCheckBlock) {
         if (SCEVCheckBlock->getTerminator() == &I)
           continue;
-        InstructionCost C = TTI->getInstructionCost(&I, CostKind);
+        InstructionCost C = TTI->getInstructionCost(&I, CM.CostKind);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
@@ -1944,7 +2009,7 @@ class GeneratedRTChecks {
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
-        InstructionCost C = TTI->getInstructionCost(&I, CostKind);
+        InstructionCost C = TTI->getInstructionCost(&I, CM.CostKind);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         MemCheckCost += C;
       }
@@ -2222,49 +2287,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
   llvm_unreachable("invalid enum");
 }
 
-static std::optional<unsigned> getMaxVScale(const Function &F,
-                                            const TargetTransformInfo &TTI) {
-  if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
-    return MaxVScale;
-
-  if (F.hasFnAttribute(Attribute::VScaleRange))
-    return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
-
-  return std::nullopt;
-}
-
-/// For the given VF and UF and maximum trip count computed for the loop, return
-/// whether the induction variable might overflow in the vectorized loop. If not,
-/// then we know a runtime overflow check always evaluates to false and can be
-/// removed.
-static bool isIndvarOverflowCheckKnownFalse(
-    const LoopVectorizationCostModel *Cost,
-    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
-  // Always be conservative if we don't know the exact unroll factor.
-  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
-
-  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
-  APInt MaxUIntTripCount = IdxTy->getMask();
-
-  // We know the runtime overflow check is known false iff the (max) trip-count
-  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
-  // the vector loop induction variable.
-  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
-    uint64_t MaxVF = VF.getKnownMinValue();
-    if (VF.isScalable()) {
-      std::optional<unsigned> MaxVScale =
-          getMaxVScale(*Cost->TheFunction, Cost->TTI);
-      if (!MaxVScale)
-        return false;
-      MaxVF *= *MaxVScale;
-    }
-
-    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
-  }
-
-  return false;
-}
-
 // Return whether we allow using masked interleave-groups (for dealing with
 // strided loads/stores that reside in predicated blocks, or for dealing
 // with gaps).
@@ -2354,13 +2376,7 @@ Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
       // check is known to be true, or known to be false.
       CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
     } // else step known to be < trip count, use CheckMinIters preset to false.
-  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
-             !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
-             Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
-    // vscale is not necessarily a power-of-2, which means we cannot guarantee
-    // an overflow to zero when updating induction variables and so an
-    // additional overflow check is required before entering the vector loop.
-
+  } else if (isIndvarOverflowCheckNeeded(*Cost, VF, UF)) {
     // Get the maximum unsigned value for the type.
     Value *MaxUIntTripCount =
         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
@@ -9122,22 +9138,14 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
 void LoopVectorizationPlanner::addMinimumIterationCheck(
     VPlan &Plan, ElementCount VF, unsigned UF,
     ElementCount MinProfitableTripCount) const {
-  // vscale is not necessarily a power-of-2, which means we cannot guarantee
-  // an overflow to zero when updating induction variables and so an
-  // additional overflow check is required before entering the vector loop.
-  bool IsIndvarOverflowCheckNeededForVF =
-      VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
-      !isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
-      CM.getTailFoldingStyle() !=
-          TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
   const uint32_t *BranchWeigths =
       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
           ? &MinItersBypassWeights[0]
           : nullptr;
   VPlanTransforms::addMinimumIterationCheck(
       Plan, VF, UF, MinProfitableTripCount,
       CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
-      IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths,
+      isIndvarOverflowCheckNeeded(CM, VF, UF), OrigLoop, BranchWeigths,
       OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
       *PSE.getSE());
 }
@@ -9249,7 +9257,7 @@ static bool processLoopInVPlanNativePath(
   VPlan &BestPlan = LVP.getPlanFor(VF.Width);
 
   {
-    GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
+    GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM);
     InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                            Checks, BestPlan);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10085,7 +10093,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (ORE->allowExtraAnalysis(LV_NAME))
     LVP.emitInvalidCostRemarks(ORE);
 
-  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
+  GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
     IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
 
@@ -10,8 +10,8 @@
 
 
 ; CHECK-LABEL: test_sge
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
+; CHECK-NOT: vector.scevcheck
+; CHECK: vector.body
 define void @test_sge(ptr noalias %A,
                       ptr noalias %B,
                       ptr noalias %C, i32 %N) {
@@ -48,8 +48,8 @@ for.end:
 }
 
 ; CHECK-LABEL: test_uge
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
+; CHECK-NOT: vector.scevcheck
+; CHECK: vector.body
 define void @test_uge(ptr noalias %A,
                       ptr noalias %B,
                       ptr noalias %C, i32 %N, i32 %Offset) {
@@ -88,8 +88,8 @@ for.end:
 }
 
 ; CHECK-LABEL: test_ule
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
+; CHECK-NOT: vector.scevcheck
+; CHECK: vector.body
 define void @test_ule(ptr noalias %A,
                       ptr noalias %B,
                       ptr noalias %C, i32 %N,
@@ -127,8 +127,8 @@ for.end:
 }
 
 ; CHECK-LABEL: test_sle
-; CHECK-LABEL: vector.scevcheck
-; CHECK-LABEL: vector.body
+; CHECK-NOT: vector.scevcheck
+; CHECK: vector.body
 define void @test_sle(ptr noalias %A,
                    ptr noalias %B,
                    ptr noalias %C, i32 %N,