Skip to content

Commit fa3ec0c

Browse files
authored
[VPlan] Materialize constant vector trip counts before final opts. (#142309)
Materialize constant vector trip counts before ::execute, if the trip count can be computed as (Original TC / (VF * UF)) * (VF * UF). For now this excludes cases where the tail is folded or a scalar epilogue is required. This enables removing a number of redundant branches from the middle block. For now this is also only done when not vectorizing the epilogue, as the simplification complicates stitching the 2 plans together. PR: #142309
1 parent 29992cf commit fa3ec0c

File tree

191 files changed

+1632
-1670
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

191 files changed

+1632
-1670
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7279,6 +7279,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72797279
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
72807280
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
72817281
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
7282+
VPlanTransforms::removeBranchOnConst(BestVPlan);
72827283
VPlanTransforms::narrowInterleaveGroups(
72837284
BestVPlan, BestVF,
72847285
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
@@ -10242,6 +10243,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1024210243
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
1024310244
ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
1024410245

10246+
// TODO: Move to general VPlan pipeline once epilogue loops are also
10247+
// supported.
10248+
VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
10249+
BestPlan, VF.Width, IC, PSE);
10250+
1024510251
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
1024610252

1024710253
ORE->emit([&]() {
@@ -10309,6 +10315,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1030910315
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
1031010316
VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
1031110317
Checks, BestPlan);
10318+
// TODO: Move to general VPlan pipeline once epilogue loops are also
10319+
// supported.
10320+
VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
10321+
BestPlan, VF.Width, IC, PSE);
10322+
1031210323
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
1031310324
++LoopsVectorized;
1031410325

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -962,7 +962,11 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
962962
BackedgeTakenCount->setUnderlyingValue(TCMO);
963963
}
964964

965-
VectorTripCount.setUnderlyingValue(VectorTripCountV);
965+
if (!VectorTripCount.getUnderlyingValue())
966+
VectorTripCount.setUnderlyingValue(VectorTripCountV);
967+
else
968+
assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
969+
"VectorTripCount set earlier must much VectorTripCountV");
966970

967971
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
968972
// FIXME: Model VF * UF computation completely in VPlan.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1883,9 +1883,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
18831883
}
18841884
}
18851885

1886-
/// Remove BranchOnCond recipes with true or false conditions together with
1887-
/// removing dead edges to their successors.
1888-
static void removeBranchOnConst(VPlan &Plan) {
1886+
void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
18891887
using namespace llvm::VPlanPatternMatch;
18901888
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
18911889
vp_depth_first_shallow(Plan.getEntry()))) {
@@ -1908,12 +1906,9 @@ static void removeBranchOnConst(VPlan &Plan) {
19081906
"There must be a single edge between VPBB and its successor");
19091907
// Values coming from VPBB into phi recipes of RemoveSucc are removed from
19101908
// these recipes.
1911-
for (VPRecipeBase &R : RemovedSucc->phis()) {
1912-
auto *Phi = cast<VPPhiAccessors>(&R);
1913-
assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
1914-
"VPIRPhis must have a single predecessor");
1915-
Phi->removeIncomingValueFor(VPBB);
1916-
}
1909+
for (VPRecipeBase &R : RemovedSucc->phis())
1910+
cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
1911+
19171912
// Disconnect blocks and remove the terminator. RemovedSucc will be deleted
19181913
// automatically on VPlan destruction if it becomes unreachable.
19191914
VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
@@ -3093,6 +3088,29 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
30933088
}
30943089
}
30953090

3091+
/// Try to set the underlying value of \p Plan's vector trip count to a
/// constant before execution. The constant is obtained by asking SCEV to fold
/// (TC udiv (BestVF * BestUF)) * (BestVF * BestUF); if the result is not a
/// SCEVConstant, \p Plan is left unchanged.
void VPlanTransforms::materializeVectorTripCount(
3092+
VPlan &Plan, ElementCount BestVF, unsigned BestUF,
3093+
PredicatedScalarEvolution &PSE) {
3094+
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
3095+
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
3096+
3097+
VPValue *TC = Plan.getTripCount();
3098+
// Skip cases for which the trip count may be non-trivial to materialize:
// plans without a scalar tail, plans whose middle block has the scalar
// preheader as its single successor, and trip counts that are not live-ins.
3099+
if (!Plan.hasScalarTail() ||
3100+
Plan.getMiddleBlock()->getSingleSuccessor() ==
3101+
Plan.getScalarPreheader() ||
3102+
!TC->isLiveIn())
3103+
return;
3104+
// Materialize the vector trip count early if it can simply be computed as
3105+
// (Original TC / (VF * UF)) * (VF * UF) and SCEV folds that to a constant.
3106+
ScalarEvolution &SE = *PSE.getSE();
3107+
auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
3108+
// VF * UF as a SCEV of the same type as the trip count.
const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
3109+
auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
3110+
// Only a fully constant result is recorded on the plan.
if (auto *NewC = dyn_cast<SCEVConstant>(VecTCScev))
3111+
Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue());
3112+
}
3113+
30963114
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
30973115
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
30983116
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,10 @@ struct VPlanTransforms {
224224
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
225225
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy);
226226

227+
/// Remove BranchOnCond recipes with true or false conditions together with
228+
/// removing dead edges to their successors.
229+
static void removeBranchOnConst(VPlan &Plan);
230+
227231
/// If there's a single exit block, optimize its phi recipes that use exiting
228232
/// IV values by feeding them precomputed end values instead, possibly taken
229233
/// one step backwards.
@@ -234,6 +238,12 @@ struct VPlanTransforms {
234238
/// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
235239
static void materializeBroadcasts(VPlan &Plan);
236240

241+
/// Materialize vector trip counts for constants early if they can simply be
242+
/// computed as (Original TC / (VF * UF)) * (VF * UF).
243+
static void materializeVectorTripCount(VPlan &Plan, ElementCount BestVF,
244+
unsigned BestUF,
245+
PredicatedScalarEvolution &PSE);
246+
237247
/// Try to convert a plan with interleave groups with VF elements to a plan
238248
/// with the interleave groups replaced by wide loads and stores processing VF
239249
/// elements, if all transformed interleave groups access the full vector

llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
368368
; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
369369
; CHECK-NEXT: br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
370370
; CHECK: [[MIDDLE_BLOCK]]:
371-
; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
371+
; CHECK-NEXT: br label %[[SCALAR_PH]]
372372
; CHECK: [[SCALAR_PH]]:
373373
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
374374
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -388,7 +388,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
388388
; CHECK: [[LOOP_LATCH]]:
389389
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1
390390
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 100
391-
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
391+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
392392
; CHECK: [[EXIT]]:
393393
; CHECK-NEXT: ret void
394394
;

llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
3232
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
3333
; CHECK: [[MIDDLE_BLOCK]]:
3434
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 1
35-
; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
35+
; CHECK-NEXT: br label %[[SCALAR_PH]]
3636
; CHECK: [[SCALAR_PH]]:
3737
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
3838
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -47,7 +47,7 @@ define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
4747
; CHECK-NEXT: store i64 [[OR]], ptr [[GEP_DST]], align 8
4848
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
4949
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 100
50-
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
50+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
5151
; CHECK: [[EXIT]]:
5252
; CHECK-NEXT: ret void
5353
;
@@ -86,9 +86,9 @@ define void @powi_call(ptr %P) {
8686
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
8787
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
8888
; CHECK: [[MIDDLE_BLOCK]]:
89-
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
89+
; CHECK-NEXT: br label %[[EXIT:.*]]
9090
; CHECK: [[SCALAR_PH]]:
91-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
91+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
9292
; CHECK-NEXT: br label %[[LOOP:.*]]
9393
; CHECK: [[LOOP]]:
9494
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]

llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ define void @_Z3foov(i64 %n) {
4040
; CHECK-V2-IC1: [[SCALAR_PH]]:
4141
; CHECK-V2-IC1: br label %[[FOR_BODY:.*]]
4242
; CHECK-V2-IC1: [[FOR_BODY]]:
43-
; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
43+
; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
4444
; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
4545
;
4646
; CHECK-V2-IC4-LABEL: define void @_Z3foov(
@@ -101,9 +101,8 @@ for.cond.cleanup: ; preds = %for.body
101101
; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
102102
; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
103103
; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
104-
; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
105-
; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
106-
; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
104+
; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
105+
; CHECK-V2-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]], [[META3]]}
107106
;.
108107
; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
109108
; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
680680
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
681681
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
682682
; DEFAULT: [[MIDDLE_BLOCK]]:
683-
; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
683+
; DEFAULT-NEXT: br label %[[SCALAR_PH]]
684684
; DEFAULT: [[SCALAR_PH]]:
685685
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[ENTRY]] ]
686686
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -696,7 +696,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
696696
; DEFAULT-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8
697697
; DEFAULT-NEXT: [[IV_CLAMP:%.*]] = and i64 [[IV]], 4294967294
698698
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_CLAMP]], 512
699-
; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
699+
; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
700700
; DEFAULT: [[EXIT]]:
701701
; DEFAULT-NEXT: ret void
702702
;
@@ -1492,7 +1492,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
14921492
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
14931493
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
14941494
; DEFAULT: [[MIDDLE_BLOCK]]:
1495-
; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
1495+
; DEFAULT-NEXT: br label %[[SCALAR_PH]]
14961496
; DEFAULT: [[SCALAR_PH]]:
14971497
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
14981498
; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -1506,7 +1506,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
15061506
; DEFAULT-NEXT: [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32
15071507
; DEFAULT-NEXT: store i32 [[T]], ptr [[DST]], align 4
15081508
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21
1509-
; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
1509+
; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
15101510
; DEFAULT: [[EXIT]]:
15111511
; DEFAULT-NEXT: ret void
15121512
;

llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ define void @old_and_new_size_equalko(ptr noalias %src, ptr noalias %dst) {
469469
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
470470
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
471471
; CHECK: middle.block:
472-
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
472+
; CHECK-NEXT: br label [[EXIT:%.*]]
473473
; CHECK: scalar.ph:
474474
; CHECK-NEXT: br label [[LOOP:%.*]]
475475
; CHECK: loop:

llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds
7272
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
7373
; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
7474
; CHECK: [[MIDDLE_BLOCK]]:
75-
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
75+
; CHECK-NEXT: br label %[[EXIT:.*]]
7676
; CHECK: [[SCALAR_PH]]:
77-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
77+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
7878
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
7979
; CHECK: [[LOOP_HEADER]]:
8080
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]

0 commit comments

Comments
 (0)