From 918d6d39180a965286017540cddd18af280265fc Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Wed, 23 Jul 2025 17:02:40 -0700
Subject: [PATCH 1/3] [VPlan] Get Addr computation cost with scalar type if it
 is uniform for gather/scatter.

This patch queries `getAddressComputationCost()` with the scalar type if the
address is uniform, which makes the cost of gathers/scatters more accurate.

Currently, LV accounts for the address computation cost of non-consecutive
VPWidenMemoryRecipes (gathers/scatters). In some cases, however, the address
is uniform across lanes, so it can be computed with the scalar type and then
broadcast (see the illustrative IR sketch appended after the patch series).

I have a follow-up optimization that converts gathers/scatters with uniform
memory accesses into a scalar load/store plus a broadcast. Once that lands,
this temporary change can be removed.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++++
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 15 ++++++++++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index be00fd6a416e5..dbc0f30cbbfaf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7018,6 +7018,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(Iter)) {
     for (VPRecipeBase &R : *VPBB) {
+      if (auto *MR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
+        // The address computation cost can be queried with the scalar type
+        // if the address is uniform.
+        if (!MR->isConsecutive() && vputils::isSingleScalar(MR->getAddr()))
+          return true;
+      }
       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
         auto *IG = IR->getInterleaveGroup();
         unsigned NumMembers = IG->getNumMembers();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e971ba1aac15c..ab11bfe560f6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3099,9 +3099,18 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
     assert(!Reverse &&
            "Inconsecutive memory access should not have the order.");
-    return Ctx.TTI.getAddressComputationCost(Ty) +
-           Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
-                                          Ctx.CostKind, &Ingredient);
+    InstructionCost Cost = 0;
+
+    // If the address value is uniform across all lanes, then the address can
+    // be calculated with the scalar type and broadcast.
+    if (vputils::isSingleScalar(getAddr()))
+      Cost += Ctx.TTI.getAddressComputationCost(Ty->getScalarType());
+    else
+      Cost += Ctx.TTI.getAddressComputationCost(Ty);
+
+    return Cost + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked,
+                                                 Alignment, Ctx.CostKind,
+                                                 &Ingredient);
   }
 
   InstructionCost Cost = 0;

From effc9f82727374b512a605c270017024f90efe72 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Thu, 7 Aug 2025 23:04:03 -0700
Subject: [PATCH 2/3] !fixup, remove unnecessary
 planContainsAdditionalSimplifications change.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dbc0f30cbbfaf..be00fd6a416e5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7018,12 +7018,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(Iter)) {
     for (VPRecipeBase &R : *VPBB) {
-      if (auto *MR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
-        // The address computation cost can be queried with the scalar type
-        // if the address is uniform.
-        if (!MR->isConsecutive() && vputils::isSingleScalar(MR->getAddr()))
-          return true;
-      }
       if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
         auto *IG = IR->getInterleaveGroup();
         unsigned NumMembers = IG->getNumMembers();

From f37a84c198191b7ecb42fcbd21074b735a26ae92 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Tue, 22 Jul 2025 20:48:37 -0700
Subject: [PATCH 3/3] [RISCV][TTI] Implement `getAddressComputationCost()` in
 RISCV TTI.

This patch implements `getAddressComputationCost()` in the RISC-V TTI, which
makes gathers/scatters that need a vector address calculation more expensive
than the strided cost.

Note that the only user of `getAddressComputationCost()` with a vector type
is `VPWidenMemoryRecipe::computeCost()`, so this patch changes some LV tests.
I've checked the test changes in LV, and they fall into two groups:

* gathers/scatters with a uniform vector pointer, which can likely be
  optimized to a masked.load, and
* accesses that can be optimized to strided loads/stores.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 13 +++
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  4 +
 .../LoopVectorize/RISCV/dead-ops-cost.ll      | 27 +++--
 .../Transforms/LoopVectorize/RISCV/pr88802.ll | 26 ++---
 .../RISCV/tail-folding-gather-scatter.ll      | 99 +++---------------
 5 files changed, 55 insertions(+), 114 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 67f924aadc8c0..3cab430d62db2 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1566,6 +1566,19 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
+InstructionCost RISCVTTIImpl::getAddressComputationCost(Type *Ty,
+                                                        ScalarEvolution *SE,
+                                                        const SCEV *Ptr) const {
+  // Address computations with vector type are usually for indexed load/store
+  // which is likely more expensive.
+ if (ST->hasVInstructions() && Ty->isVectorTy()) + return getArithmeticInstrCost( + Instruction::Add, Ty, TTI::TCK_RecipThroughput, + {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, {}); + + return BaseT::getAddressComputationCost(Ty, SE, Ptr); +} + InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 6a1f4b3e3bedf..dbf421dbae61d 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -177,6 +177,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase { getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override; + InstructionCost + getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr, + const SCEV *Ptr = nullptr) const override; + InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 18757a7a02e17..ed87d2d916b39 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -93,8 +93,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i32 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 6, i32 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i32 [[TMP0]], 8 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: @@ -106,29 +105,29 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 252, [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 8 ; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4 -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP11:%.*]] = mul [[TMP9]], splat (i32 4) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv8i32() +; CHECK-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i32 4) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP8]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP14]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = 
insertelement poison, i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP15:%.*]] = sext [[VEC_IND]] to -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], [[TMP15]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0( zeroinitializer, [[TMP16]], i32 1, splat (i1 true)), !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = sext [[VEC_IND]] to +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], [[TMP12]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv8i8.nxv8p0( zeroinitializer, [[TMP13]], i32 1, splat (i1 true)), !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index d41d47a6448be..b646c2259ada5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -34,22 +34,22 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 9, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP11]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP20]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.stepvector.nxv2i32() -; CHECK-NEXT: [[TMP13:%.*]] = icmp ult [[TMP19]], [[BROADCAST_SPLAT6]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp sge [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP15:%.*]] = select [[TMP13]], [[TMP14]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP16:%.*]] = shl [[PREDPHI]], splat (i32 8) -; CHECK-NEXT: [[TMP17:%.*]] = trunc [[TMP16]] to -; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( [[TMP17]], align 1 [[BROADCAST_SPLAT4]], splat (i1 true), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP11]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[TMP20]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector 
[[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP14:%.*]] = icmp ult [[TMP13]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp sge [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP16:%.*]] = select [[TMP14]], [[TMP15]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = shl [[PREDPHI]], splat (i32 8) +; CHECK-NEXT: [[TMP19:%.*]] = trunc [[TMP17]] to +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( [[TMP19]], align 1 [[BROADCAST_SPLAT4]], splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP11]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP11]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 9 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll index da5aed943bc75..ba7005f4f56dc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll @@ -10,110 +10,35 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) { ; IF-EVL-LABEL: @gather_scatter( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] -; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 -; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]] -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] -; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 -; IF-EVL-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv2i64() -; IF-EVL-NEXT: [[TMP10:%.*]] = mul [[TMP9]], splat (i64 1) -; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP10]] -; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: vector.body: -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; IF-EVL-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], [[VEC_IND]] -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP14]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: 
[[WIDE_MASKED_GATHER2:%.*]] = call @llvm.vp.gather.nxv2f32.nxv2p0( align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_MASKED_GATHER]] -; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0( [[WIDE_MASKED_GATHER2]], align 4 [[TMP16]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] -; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] -; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: -; IF-EVL-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV1]] +; IF-EVL-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ] +; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV1]] ; IF-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8 -; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP0]] +; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]] ; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]] +; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]] ; IF-EVL-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4 ; IF-EVL-NEXT: [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N:%.*]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY1]] ; IF-EVL: for.end: ; IF-EVL-NEXT: ret void ; ; NO-VP-LABEL: @gather_scatter( ; NO-VP-NEXT: entry: -; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; NO-VP-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2 -; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP14]] -; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] -; NO-VP: vector.ph: -; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 -; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() -; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) -; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP5]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 -; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = 
shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NO-VP-NEXT: br label [[FOR_BODY:%.*]] -; NO-VP: vector.body: -; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; NO-VP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ] -; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], [[VEC_IND]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP9]], i32 8, splat (i1 true), poison) -; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2f32.nxv2p0( [[TMP10]], i32 4, splat (i1 true), poison) -; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: call void @llvm.masked.scatter.nxv2f32.nxv2p0( [[WIDE_MASKED_GATHER2]], [[TMP11]], i32 4, splat (i1 true)) -; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], [[TMP5]] -; NO-VP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] -; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N_VEC]] -; NO-VP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; NO-VP: middle.block: -; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] -; NO-VP: scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] ; NO-VP-NEXT: br label [[FOR_BODY1:%.*]] ; NO-VP: for.body: -; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV1]] +; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ] +; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV1]] ; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8 -; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP0]] +; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]] ; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4 -; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]] +; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]] ; NO-VP-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4 ; NO-VP-NEXT: [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1 -; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N]] -; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N:%.*]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY1]] ; NO-VP: for.end: ; NO-VP-NEXT: ret void ;
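
For reference only (not part of the patches): a minimal LLVM IR sketch of the uniform-address case that the cost change in PATCH 1/3 targets. The function names and fixed-width types below are illustrative assumptions, not code from the patches. In the first function, every lane of the gather reads the same location, so the address can be computed once with the scalar type and broadcast; the second function shows the scalar load plus broadcast form that the follow-up optimization mentioned in PATCH 1/3 would produce.

; Illustrative only: a gather whose vector of pointers is a splat of a single
; scalar address. All lanes load the same location, so the address
; computation is effectively scalar (compute %p once, then broadcast it).
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)

define <4 x i32> @uniform_addr_gather(ptr %base, i64 %idx) {
  %p = getelementptr inbounds i32, ptr %base, i64 %idx
  %p.ins = insertelement <4 x ptr> poison, ptr %p, i64 0
  %p.splat = shufflevector <4 x ptr> %p.ins, <4 x ptr> poison, <4 x i32> zeroinitializer
  %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %p.splat, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %v
}

; Illustrative only: the scalar load + broadcast form the follow-up
; optimization would emit for the gather above.
define <4 x i32> @uniform_addr_scalar_load(ptr %base, i64 %idx) {
  %p = getelementptr inbounds i32, ptr %base, i64 %idx
  %x = load i32, ptr %p, align 4
  %x.ins = insertelement <4 x i32> poison, i32 %x, i64 0
  %x.splat = shufflevector <4 x i32> %x.ins, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %x.splat
}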