Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 77 additions & 71 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1766,6 +1766,64 @@ class LoopVectorizationCostModel {
};
} // end namespace llvm

/// Determine an upper bound for vscale in function \p F.
///
/// The value reported by \p TTI takes precedence. When the target does not
/// provide one, the maximum of the function's vscale_range attribute is used
/// if that attribute is present. Returns std::nullopt when neither source
/// yields a bound.
static std::optional<unsigned> getMaxVScale(const Function &F,
                                            const TargetTransformInfo &TTI) {
  std::optional<unsigned> Bound = TTI.getMaxVScale();

  // Only consult the function attribute when the target gave no answer.
  if (!Bound && F.hasFnAttribute(Attribute::VScaleRange))
    Bound = F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();

  return Bound;
}

/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If
/// not, then we know a runtime overflow check always evaluates to false and can
/// be removed.
static bool
isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost,
ElementCount VF,
std::optional<unsigned> UF = std::nullopt) {
// Always be conservative if we don't know the exact unroll factor.
unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
APInt MaxUIntTripCount = IdxTy->getMask();

// We know the runtime overflow check is known false iff the (max) trip-count
// is known and (max) trip-count + (VF * UF) does not overflow in the type of
// the vector loop induction variable.
if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
uint64_t MaxVF = VF.getKnownMinValue();
if (VF.isScalable()) {
std::optional<unsigned> MaxVScale =
getMaxVScale(*Cost->TheFunction, Cost->TTI);
if (!MaxVScale)
return false;
MaxVF *= *MaxVScale;
}

return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
}

return false;
}

/// Returns true if an induction-variable overflow check must be emitted
/// before entering the vector loop for \p VF and interleave count \p IC.
///
/// Combines isIndvarOverflowCheckKnownFalse with information about the
/// target's vscale and the tail-folding style chosen by \p CM.
static bool isIndvarOverflowCheckNeeded(const LoopVectorizationCostModel &CM,
                                        ElementCount VF, unsigned IC) {
  // Only scalable VFs are affected.
  if (!VF.isScalable())
    return false;

  // vscale is not necessarily a power-of-2, which means we cannot guarantee
  // an overflow to zero when updating induction variables. If the target
  // does guarantee a power-of-2 vscale, no additional check is required.
  if (CM.TTI.isVScaleKnownToBeAPowerOfTwo())
    return false;

  // The check is redundant when it is known to always evaluate to false.
  if (isIndvarOverflowCheckKnownFalse(&CM, VF, IC))
    return false;

  // This tail-folding style is the one case that does not use the runtime
  // check.
  return CM.getTailFoldingStyle() !=
         TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
Expand All @@ -1790,7 +1848,6 @@ class GeneratedRTChecks {

DominatorTree *DT;
LoopInfo *LI;
TargetTransformInfo *TTI;

SCEVExpander SCEVExp;
SCEVExpander MemCheckExp;
Expand All @@ -1801,17 +1858,16 @@ class GeneratedRTChecks {

PredicatedScalarEvolution &PSE;

/// The kind of cost that we are calculating
TTI::TargetCostKind CostKind;
/// The CostModel.
const LoopVectorizationCostModel &CM;

public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
TTI::TargetCostKind CostKind)
: DT(DT), LI(LI), TTI(TTI),
LoopInfo *LI, LoopVectorizationCostModel &CM)
: DT(DT), LI(LI),
SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
PSE(PSE), CostKind(CostKind) {}
PSE(PSE), CM(CM) {}

/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
Expand All @@ -1833,11 +1889,18 @@ class GeneratedRTChecks {
BasicBlock *LoopHeader = L->getHeader();
BasicBlock *Preheader = L->getLoopPreheader();

// SCEVChecks are droppable when the UnionPred is always true, or when
// IndVar overflow checks are not needed, under the condition that we don't
// drop stride-versioning checks.
bool SCEVChecksAreDroppable =
UnionPred.isAlwaysTrue() || (!isIndvarOverflowCheckNeeded(CM, VF, IC) &&
LAI.getSymbolicStrides().empty());

// Use SplitBlock to create blocks for SCEV & memory runtime checks to
// ensure the blocks are properly added to LoopInfo & DominatorTree. Those
// may be used by SCEVExpander. The blocks will be un-linked from their
// predecessors and removed from LI & DT at the end of the function.
if (!UnionPred.isAlwaysTrue()) {
if (!SCEVChecksAreDroppable) {
SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
nullptr, "vector.scevcheck");

Expand Down Expand Up @@ -1935,7 +1998,7 @@ class GeneratedRTChecks {
for (Instruction &I : *SCEVCheckBlock) {
if (SCEVCheckBlock->getTerminator() == &I)
continue;
InstructionCost C = TTI->getInstructionCost(&I, CostKind);
InstructionCost C = CM.TTI.getInstructionCost(&I, CM.CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
Expand All @@ -1944,7 +2007,7 @@ class GeneratedRTChecks {
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
InstructionCost C = TTI->getInstructionCost(&I, CostKind);
InstructionCost C = CM.TTI.getInstructionCost(&I, CM.CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
MemCheckCost += C;
}
Expand Down Expand Up @@ -2222,49 +2285,6 @@ emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
llvm_unreachable("invalid enum");
}

/// Determine an upper bound for vscale in function \p F.
///
/// The value reported by \p TTI takes precedence. When the target does not
/// provide one, the maximum of the function's vscale_range attribute is used
/// if that attribute is present. Returns std::nullopt when neither source
/// yields a bound.
static std::optional<unsigned> getMaxVScale(const Function &F,
                                            const TargetTransformInfo &TTI) {
  std::optional<unsigned> Bound = TTI.getMaxVScale();

  // Only consult the function attribute when the target gave no answer.
  if (!Bound && F.hasFnAttribute(Attribute::VScaleRange))
    Bound = F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();

  return Bound;
}

/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
static bool isIndvarOverflowCheckKnownFalse(
const LoopVectorizationCostModel *Cost,
ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
// Always be conservative if we don't know the exact unroll factor.
unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
APInt MaxUIntTripCount = IdxTy->getMask();

// We know the runtime overflow check is known false iff the (max) trip-count
// is known and (max) trip-count + (VF * UF) does not overflow in the type of
// the vector loop induction variable.
if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
uint64_t MaxVF = VF.getKnownMinValue();
if (VF.isScalable()) {
std::optional<unsigned> MaxVScale =
getMaxVScale(*Cost->TheFunction, Cost->TTI);
if (!MaxVScale)
return false;
MaxVF *= *MaxVScale;
}

return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
}

return false;
}

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
Expand Down Expand Up @@ -2354,13 +2374,7 @@ Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
// check is known to be true, or known to be false.
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
} // else step known to be < trip count, use CheckMinIters preset to false.
} else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
!isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
// vscale is not necessarily a power-of-2, which means we cannot guarantee
// an overflow to zero when updating induction variables and so an
// additional overflow check is required before entering the vector loop.

} else if (isIndvarOverflowCheckNeeded(*Cost, VF, UF)) {
// Get the maximum unsigned value for the type.
Value *MaxUIntTripCount =
ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
Expand Down Expand Up @@ -9122,22 +9136,14 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
void LoopVectorizationPlanner::addMinimumIterationCheck(
VPlan &Plan, ElementCount VF, unsigned UF,
ElementCount MinProfitableTripCount) const {
// vscale is not necessarily a power-of-2, which means we cannot guarantee
// an overflow to zero when updating induction variables and so an
// additional overflow check is required before entering the vector loop.
bool IsIndvarOverflowCheckNeededForVF =
VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
!isIndvarOverflowCheckKnownFalse(&CM, VF, UF) &&
CM.getTailFoldingStyle() !=
TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
const uint32_t *BranchWeigths =
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())
? &MinItersBypassWeights[0]
: nullptr;
VPlanTransforms::addMinimumIterationCheck(
Plan, VF, UF, MinProfitableTripCount,
CM.requiresScalarEpilogue(VF.isVector()), CM.foldTailByMasking(),
IsIndvarOverflowCheckNeededForVF, OrigLoop, BranchWeigths,
isIndvarOverflowCheckNeeded(CM, VF, UF), OrigLoop, BranchWeigths,
OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
*PSE.getSE());
}
Expand Down Expand Up @@ -9249,7 +9255,7 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);

{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
GeneratedRTChecks Checks(PSE, DT, LI, CM);
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
Expand Down Expand Up @@ -10085,7 +10091,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);

GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
GeneratedRTChecks Checks(PSE, DT, LI, CM);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@


; CHECK-LABEL: test_sge
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; CHECK-NOT: vector.scevcheck
; CHECK: vector.body
define void @test_sge(ptr noalias %A,
ptr noalias %B,
ptr noalias %C, i32 %N) {
Expand Down Expand Up @@ -48,8 +48,8 @@ for.end:
}

; CHECK-LABEL: test_uge
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; CHECK-NOT: vector.scevcheck
; CHECK: vector.body
define void @test_uge(ptr noalias %A,
ptr noalias %B,
ptr noalias %C, i32 %N, i32 %Offset) {
Expand Down Expand Up @@ -88,8 +88,8 @@ for.end:
}

; CHECK-LABEL: test_ule
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; CHECK-NOT: vector.scevcheck
; CHECK: vector.body
define void @test_ule(ptr noalias %A,
ptr noalias %B,
ptr noalias %C, i32 %N,
Expand Down Expand Up @@ -127,8 +127,8 @@ for.end:
}

; CHECK-LABEL: test_sle
; CHECK-LABEL: vector.scevcheck
; CHECK-LABEL: vector.body
; CHECK-NOT: vector.scevcheck
; CHECK: vector.body
define void @test_sle(ptr noalias %A,
ptr noalias %B,
ptr noalias %C, i32 %N,
Expand Down
Loading