
[VPlan] Materialize vector trip count using VPInstructions. #151925

Merged
merged 10 commits on Aug 8, 2025
66 changes: 4 additions & 62 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -548,9 +548,6 @@ class InnerLoopVectorizer {
protected:
friend class LoopVectorizationPlanner;

/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

// Create a check to see if the vector loop should be executed
Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;

@@ -2272,56 +2269,6 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
return TTI.enableMaskedInterleavedAccessVectorization();
}

Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
if (VectorTripCount)
return VectorTripCount;

Value *TC = getTripCount();
IRBuilder<> Builder(InsertBlock->getTerminator());

Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
Value *Step = createStepForVF(Builder, Ty, VF, UF);

// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first
// adding Step-1 and then rounding down. Note that it's ok if this addition
// overflows: the vector induction variable will eventually wrap to zero given
// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.
// For scalable vectors the VF is not guaranteed to be a power of 2, but this
// is accounted for in emitIterationCountCheck that adds an overflow check.
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
TC = Builder.CreateAdd(TC, Builder.CreateSub(Step, ConstantInt::get(Ty, 1)),
"n.rnd.up");
}

// Now we need to generate the expression for the part of the loop that the
// vectorized body will execute. This is equal to N - (N % Step) if scalar
// iterations are not required for correctness, or N - Step, otherwise. Step
// is equal to the vectorization factor (number of SIMD elements) times the
// unroll factor (number of SIMD instructions).
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

// There are cases where we *must* run at least one iteration in the remainder
// loop. See the cost model for when this can happen. If the step evenly
// divides the trip count, we set the remainder to be equal to the step. If
// the step does not evenly divide the trip count, no adjustment is necessary
// since there will already be scalar iterations. Note that the minimum
// iterations check ensures that N >= Step.
if (Cost->requiresScalarEpilogue(VF.isVector())) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}

VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

return VectorTripCount;
}

void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
// Note: The block with the minimum trip-count check is already connected
// during earlier VPlan construction.
@@ -7354,6 +7301,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
VPlanTransforms::materializeVectorTripCount(
BestVPlan, VectorPH, CM.foldTailByMasking(),
CM.requiresScalarEpilogue(BestVF.isVector()));

// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7410,8 +7360,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
//===------------------------------------------------===//

// 2. Copy and widen instructions from the old loop into the new loop.
BestVPlan.prepareToExecute(
ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
BestVPlan.prepareToExecute(State);
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);

// Move check blocks to their final position.
@@ -9407,13 +9356,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, Index, getStartValue()->getLiveInIRValue(), Step, Kind,
cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
// If index is the vector trip count, the concrete value will only be set in
// prepareToExecute, leading to missed simplifications, e.g. if it is 0.
// TODO: Remove the special case for the vector trip count once it is computed
// in VPlan and can be used during VPlan simplification.
assert((DerivedIV != Index ||
getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
"IV didn't need transforming?");
State.set(this, DerivedIV, VPLane(0));
}

10 changes: 2 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -951,15 +951,9 @@ VPlan::~VPlan() {
delete BackedgeTakenCount;
}

void VPlan::prepareToExecute(Value *VectorTripCountV, VPTransformState &State) {
if (!VectorTripCount.getUnderlyingValue())
VectorTripCount.setUnderlyingValue(VectorTripCountV);
else
assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
"VectorTripCount set earlier must much VectorTripCountV");

void VPlan::prepareToExecute(VPTransformState &State) {
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
Type *TCTy = VectorTripCountV->getType();
Type *TCTy = VPTypeAnalysis(*this).inferScalarType(getTripCount());
// FIXME: Model VF * UF computation completely in VPlan.
unsigned UF = getUF();
if (VF.getNumUsers()) {
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3969,7 +3969,7 @@ class VPlan {
}

/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *VectorTripCount, VPTransformState &State);
void prepareToExecute(VPTransformState &State);

/// Generate the IR code for this VPlan.
void execute(VPTransformState *State);
61 changes: 61 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3278,6 +3278,67 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}

void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
VPBasicBlock *VectorPHVPBB,
bool TailByMasking,
bool RequiresScalarEpilogue) {
VPValue &VectorTC = Plan.getVectorTripCount();
assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
// There's nothing to do if there are no users of the vector trip count or its
// IR value has already been set.
if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
return;

VPValue *TC = Plan.getTripCount();
Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
VPValue *Step = &Plan.getVFxUF();

// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first
// adding Step-1 and then rounding down. Note that it's ok if this addition
// overflows: the vector induction variable will eventually wrap to zero given
// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.
// For scalable vectors the VF is not guaranteed to be a power of 2, but this
// is accounted for in emitIterationCountCheck that adds an overflow check.
if (TailByMasking) {
TC = Builder.createNaryOp(
Instruction::Add,
{TC, Builder.createNaryOp(
Instruction::Sub,
{Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
DebugLoc::getCompilerGenerated(), "n.rnd.up");
}

// Now we need to generate the expression for the part of the loop that the
// vectorized body will execute. This is equal to N - (N % Step) if scalar
// iterations are not required for correctness, or N - Step, otherwise. Step
// is equal to the vectorization factor (number of SIMD elements) times the
// unroll factor (number of SIMD instructions).
VPValue *R =
Builder.createNaryOp(Instruction::URem, {TC, Step},
DebugLoc::getCompilerGenerated(), "n.mod.vf");

// There are cases where we *must* run at least one iteration in the remainder
// loop. See the cost model for when this can happen. If the step evenly
// divides the trip count, we set the remainder to be equal to the step. If
// the step does not evenly divide the trip count, no adjustment is necessary
// since there will already be scalar iterations. Note that the minimum
// iterations check ensures that N >= Step.
if (RequiresScalarEpilogue) {
assert(!TailByMasking &&
"requiring scalar epilogue is not supported with fail folding");
VPValue *IsZero = Builder.createICmp(
CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
R = Builder.createSelect(IsZero, Step, R);
Review comment (Contributor):

This isn't a problem with your patch, but isn't there some odd behaviour here when tail-folding a loop that requires a scalar epilogue? Suppose TC = (VF * UF) + 1; for the tail-folding case we add (VF * UF) - 1, which means that the remainder is 0. If requiring a scalar epilogue, we then set the remainder to be TC - (VF * UF), i.e. 1. Performance won't be great. :)

Perhaps we don't permit tail-folding when requiring a scalar epilogue?

Reply (Contributor, Author):

Yes, they are mutually exclusive at the moment. Added an assert though, in case it changes in the future.
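To make the arithmetic in the exchange above concrete, here is a minimal standalone sketch with assumed values (Step = VF * UF = 8, TC = 9). It is illustrative only and not part of the patch, which asserts that this combination cannot occur:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Step = 8; // assumed VF * UF
  const uint64_t TC = 9;   // assumed trip count, Step + 1

  // Tail folding: round TC up to a multiple of Step.
  uint64_t Rounded = TC + (Step - 1); // 16
  uint64_t Rem = Rounded % Step;      // 0

  // Scalar epilogue required: force a full Step of remainder iterations.
  if (Rem == 0)
    Rem = Step; // 8

  uint64_t VecTC = Rounded - Rem; // 8: the vector loop covers 8 iterations,
                                  // leaving TC - VecTC = 1 scalar iteration.
  assert(VecTC == 8 && TC - VecTC == 1);
  return 0;
}
```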

}

VPValue *Res = Builder.createNaryOp(
Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
VectorTC.replaceAllUsesWith(Res);
}

/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
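For reference, the value the new transform builds out of VPInstructions in the vector preheader matches the scalar computation previously emitted by getOrCreateVectorTripCount. A minimal sketch of that computation follows; the helper name and standalone form are ours, while the n.rnd.up / n.mod.vf / n.vec names correspond to the values checked in the tests below:

```cpp
#include <cstdint>

// Sketch only: compute the vector trip count for a trip count TC and
// Step = VF * UF, mirroring the VPInstruction sequence created above.
uint64_t computeVectorTripCount(uint64_t TC, uint64_t Step, bool TailByMasking,
                                bool RequiresScalarEpilogue) {
  if (TailByMasking)
    TC = TC + (Step - 1); // n.rnd.up: round up to a multiple of Step.
  uint64_t R = TC % Step; // n.mod.vf: iterations left for the scalar loop.
  if (RequiresScalarEpilogue && R == 0)
    R = Step;             // keep Step iterations so the scalar epilogue runs.
  return TC - R;          // n.vec
}
```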
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -256,6 +256,12 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);

/// Materialize vector trip count computations to a set of VPInstructions.
static void materializeVectorTripCount(VPlan &Plan,
VPBasicBlock *VectorPHVPBB,
bool TailByMasking,
bool RequiresScalarEpilogue);

/// Materialize the backedge-taken count to be computed explicitly using
/// VPInstructions.
static void materializeBackedgeTakenCount(VPlan &Plan,
20 changes: 4 additions & 16 deletions llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -9,19 +9,13 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP1]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -34,7 +28,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -92,19 +86,13 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP1]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -117,7 +105,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -702,12 +702,6 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 2
; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]]
@@ -726,7 +720,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[NEXT_GEP]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
@@ -1242,9 +1236,6 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
; PRED-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
; PRED-NEXT: br i1 [[TMP14]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP15:%.*]] = sub i64 [[TMP0]], 8
; PRED-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[TMP0]], 8
; PRED-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0