From 450570470e4f29c7230243df87b8723b8ce6cb5b Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Thu, 24 Jul 2025 00:21:01 +0800
Subject: [PATCH 1/3] [VPlan] Fold common edges away in convertPhisToBlends

If a phi is widened with tail folding, all of its predecessors will have
a mask of the form

%x = logical-and %active-lane-mask, %foo
%y = logical-and %active-lane-mask, %bar
%z = logical-and %active-lane-mask, %baz
...

We can remove the common %active-lane-mask from all of these edge masks,
which in turn simplifies a lot of the masks. This allows us to remove
VPBlendRecipe and directly emit VPInstruction::Select in another patch.
---
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 28 +++++++++++++++
 .../AArch64/masked-call-scalarize.ll          | 15 ++++----
 .../LoopVectorize/AArch64/masked-call.ll      | 34 ++++++++-----------
 .../AArch64/scalable-strict-fadd.ll           |  2 +-
 .../AArch64/sve-tail-folding-reductions.ll    |  2 +-
 .../LoopVectorize/AArch64/sve-tail-folding.ll |  5 +--
 .../AArch64/tail-fold-uniform-memops.ll       |  5 +--
 .../Transforms/LoopVectorize/RISCV/pr88802.ll |  6 ++--
 .../LoopVectorize/RISCV/uniform-load-store.ll |  4 +--
 ...rize-force-tail-with-evl-cond-reduction.ll | 32 +++--------------
 ...bounds-flags-for-reverse-vector-pointer.ll |  5 ++-
 .../predicatedinst-loop-invariant.ll          | 11 +++---
 .../Transforms/LoopVectorize/uniform-blend.ll | 16 ++++-----
 .../vplan-sink-scalars-and-merge.ll           |  6 ++--
 14 files changed, 80 insertions(+), 91 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f0cab79197b4d..2ab7833902954 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,6 +14,7 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -65,6 +66,10 @@ class VPPredicator {
     return EdgeMaskCache[{Src, Dst}] = Mask;
   }
 
+  /// Given a widened phi \p PhiR, try to see if its incoming blocks all share a
+  /// common edge and return its mask.
+  VPValue *findCommonEdgeMask(const VPWidenPHIRecipe *PhiR) const;
+
 public:
   /// Returns the precomputed predicate of the edge from \p Src to \p Dst.
   VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
@@ -227,6 +232,20 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
   setEdgeMask(Src, DefaultDst, DefaultMask);
 }
 
+VPValue *VPPredicator::findCommonEdgeMask(const VPWidenPHIRecipe *PhiR) const {
+  using namespace llvm::VPlanPatternMatch;
+  VPValue *EdgeMask = getEdgeMask(PhiR->getIncomingBlock(0), PhiR->getParent());
+  VPValue *CommonEdgeMask;
+  if (!EdgeMask ||
+      !match(EdgeMask, m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
+    return nullptr;
+  for (unsigned In = 1; In < PhiR->getNumIncoming(); In++)
+    if (!match(getEdgeMask(PhiR->getIncomingBlock(In), PhiR->getParent()),
+               m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
+      return nullptr;
+  return CommonEdgeMask;
+}
+
 void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   SmallVector<VPWidenPHIRecipe *> Phis;
   for (VPRecipeBase &R : VPBB->phis())
@@ -238,6 +257,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
     // be duplications since this is a simple recursive scan, but future
    // optimizations will clean it up.
+    VPValue *CommonEdgeMask = findCommonEdgeMask(PhiR);
     SmallVector<VPValue *, 2> OperandsWithMask;
     unsigned NumIncoming = PhiR->getNumIncoming();
     for (unsigned In = 0; In < NumIncoming; In++) {
@@ -250,6 +270,14 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
                "Distinct incoming values with one having a full mask");
         break;
       }
+
+      // If all incoming blocks share a common edge, remove it from the mask.
+      using namespace llvm::VPlanPatternMatch;
+      VPValue *X;
+      if (match(EdgeMask,
+                m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue(X))))
+        EdgeMask = X;
+
       OperandsWithMask.push_back(EdgeMask);
     }
     PHINode *IRPhi = cast<PHINode>(PhiR->getUnderlyingValue());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
index 078f98f54525b..c0f044583e48d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
@@ -68,9 +68,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFCOMMON-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR3]]
 ; TFCOMMON-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
 ; TFCOMMON-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; TFCOMMON-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
-; TFCOMMON-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFCOMMON-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
 ; TFCOMMON-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
 ; TFCOMMON-NEXT:    br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; TFCOMMON:       pred.store.if:
@@ -115,12 +114,10 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR3]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
 ; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1
-; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = fcmp ule <2 x double> [[TMP12]], zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK2]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[PREDPHI3:%.*]] = select <2 x i1> [[TMP18]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP13:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP13]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFA_INTERLEAVE-NEXT:
[[PREDPHI3:%.*]] = select <2 x i1> [[TMP14]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) ; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; TFA_INTERLEAVE: pred.store.if: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 6029095bbe7b1..543d29700bc4c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -195,7 +195,7 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], splat (i64 50) ; TFCOMMON-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer ; TFCOMMON-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], zeroinitializer +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP6]], [[TMP8]], zeroinitializer ; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] @@ -232,8 +232,8 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP12]], zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP13]]) ; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP14]]) -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[TMP15]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP14]], [[TMP16]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[TMP15]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP12]], [[TMP16]], zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 2 @@ -354,7 +354,7 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP9:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) ; TFCOMMON-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer ; TFCOMMON-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP10]]) -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[TMP11]], [[TMP9]] +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP6]], [[TMP11]], [[TMP9]] ; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] @@ -397,8 +397,8 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP12]], zeroinitializer ; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP19]]) ; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP20]]) -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP19]], [[TMP21]], [[TMP17]] -; 
TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP20]], [[TMP22]], [[TMP18]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[TMP21]], [[TMP17]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP12]], [[TMP22]], [[TMP18]] ; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2 @@ -996,31 +996,27 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]] ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: -; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP19:.*]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP19]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP19]] ] +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP18:.*]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP18]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP18]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]] ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]] ; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = xor i1 [[TMP7]], true -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = xor i1 [[TMP8]], true -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP9]], i1 false -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP10]], i1 false -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP12]], double 1.000000e+00, double 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP7]], double 0.000000e+00, double 1.000000e+00 +; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP8]], double 0.000000e+00, double 1.000000e+00 ; TFA_INTERLEAVE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], double [[PREDPHI3]], double [[PREDPHI]] ; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = xor i1 [[ACTIVE_LANE_MASK]], true ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = xor i1 [[ACTIVE_LANE_MASK2]], true ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor i1 [[TMP13]], true ; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor i1 [[TMP14]], true ; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]] -; TFA_INTERLEAVE-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[TMP19]] -; TFA_INTERLEAVE: [[BB18]]: +; TFA_INTERLEAVE-NEXT: br i1 [[TMP17]], label %[[BB14:.*]], label %[[TMP18]] +; TFA_INTERLEAVE: [[BB14]]: ; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8 -; TFA_INTERLEAVE-NEXT: br label %[[TMP19]] -; TFA_INTERLEAVE: [[TMP19]]: +; TFA_INTERLEAVE-NEXT: br label %[[TMP18]] 
+; TFA_INTERLEAVE: [[TMP18]]: ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1 ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index d73cdc1228fed..c1a6e4c162bb0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -1151,7 +1151,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP14]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, [[TMP13]], poison) -; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[WIDE_MASKED_LOAD1]], splat (float 3.000000e+00) +; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], [[WIDE_MASKED_LOAD1]], splat (float 3.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP16]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index 2de24b0f654d5..e8700a495ddf5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -289,7 +289,7 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[TMP15]], poison) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[VEC_PHI]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[TMP17]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP20]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index f6f8895c2c70d..322c4ce6d477e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -378,10 +378,11 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], splat (i1 true) ; 
CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[WIDE_MASKED_GATHER]], zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], zeroinitializer, [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP17]], i32 4, [[ACTIVE_LANE_MASK]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll index f6443b1e28ba6..1db911230a26a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -60,8 +60,9 @@ define void @cond_uniform_load(ptr nocapture %dst, ptr nocapture readonly %src, ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] ; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[COND_LOAD]], zeroinitializer -; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer ; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> poison) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index 02288112f3389..ef154fea0ca67 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -38,10 +38,8 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[TMP12:%.*]] = mul i32 1, [[TMP11]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[TMP12]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = icmp ule [[VEC_IND]], splat (i32 8) -; CHECK-NEXT: [[TMP14:%.*]] = icmp sge [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP15:%.*]] = select [[TMP13]], [[TMP14]], zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP16:%.*]] = shl [[PREDPHI]], splat (i32 8) ; CHECK-NEXT: [[TMP17:%.*]] = trunc [[TMP16]] to ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( [[TMP17]], align 1 [[BROADCAST_SPLAT4]], splat (i1 true), i32 [[TMP11]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index c6955f11d8520..2b7e56dcfff94 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -399,11 +399,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]] ; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule [[VEC_IND]], splat (i64 1024) ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], splat (i64 10) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer ; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i64.nxv4p0( align 8 [[BROADCAST_SPLAT]], [[TMP10]], i32 [[TMP7]]) -; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[WIDE_MASKED_GATHER]], zeroinitializer +; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[WIDE_MASKED_GATHER]], zeroinitializer ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 ; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0( [[PREDPHI]], ptr align 8 [[TMP13]], splat (i1 true), i32 [[TMP7]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 2926011857ae9..a9eb0a22fdc46 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -262,12 +262,9 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -275,19 +272,12 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true) -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = add zeroinitializer, [[TMP13]] -; IF-EVL-OUTLOOP-NEXT: 
[[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP14]] -; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sle [[VP_OP_LOAD]], splat (i32 3) +; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = icmp sgt [[VP_OP_LOAD]], splat (i32 3) ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = add [[VEC_PHI]], [[VP_OP_LOAD]] -; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP18]], zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[PREDPHI1:%.*]] = select [[TMP21]], [[VEC_PHI]], [[TMP19]] +; IF-EVL-OUTLOOP-NEXT: [[PREDPHI1:%.*]] = select [[TMP12]], [[TMP19]], [[VEC_PHI]] ; IF-EVL-OUTLOOP-NEXT: [[PREDPHI]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[PREDPHI1]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP11]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] @@ -790,15 +780,9 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4 ; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[START]], i32 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i64 1) -; IF-EVL-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i32() ; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = mul [[TMP12]], splat (i32 1) ; IF-EVL-OUTLOOP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP13]] @@ -806,7 +790,6 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP: vector.body: ; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; IF-EVL-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[IV]] @@ -814,23 +797,16 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = mul i32 1, [[TMP14]] ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP15]], i64 0 ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; 
IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 -; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP16]] -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], splat (i1 true), i32 [[TMP14]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sle [[VP_OP_LOAD]], [[VEC_IND2]] +; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp sgt [[VP_OP_LOAD]], [[VEC_IND2]] ; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = add [[VEC_PHI]], [[VP_OP_LOAD]] -; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = select [[TMP18]], [[TMP21]], zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[PREDPHI:%.*]] = select [[TMP23]], [[VEC_PHI]], [[TMP22]] +; IF-EVL-OUTLOOP-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], [[TMP22]], [[VEC_PHI]] ; IF-EVL-OUTLOOP-NEXT: [[TMP24]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[PREDPHI]], [[VEC_PHI]], i32 [[TMP14]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP25:%.*]] = zext i32 [[TMP14]] to i64 ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add [[VEC_IND2]], [[BROADCAST_SPLAT4]] ; IF-EVL-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll index 1249df4af62ad..09e149c4c813f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -21,7 +21,7 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 10) ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1) -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3 @@ -30,8 +30,7 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[REVERSE1]], <4 x i32> [[TMP8]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> 
[[REVERSE1]] ; CHECK-NEXT: [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP11]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll index 0a975108edeed..01223c3d0f3b8 100644 --- a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll @@ -19,10 +19,8 @@ define void @loop_invariant_store(ptr %p, i64 %a, i8 %b) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 8) -; CHECK-NEXT: [[TMP5:%.*]] = icmp sge <4 x i32> [[VEC_IND]], splat (i32 2) -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8) ; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i8> [[TMP8]], i32 3 @@ -107,9 +105,8 @@ define void @loop_invariant_srem(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IND:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], ; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 8) -; CHECK-NEXT: [[TMP5:%.*]] = icmp sge <4 x i8> [[VEC_IND1]], splat (i8 2) -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i8> [[VEC_IND1]], splat (i8 2) +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP3]], <4 x i32> [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8) ; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8> ; CHECK-NEXT: [[TMP11:%.*]] = srem <4 x i8> [[VEC_IND1]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index 67d99dc733624..7b1a9458db315 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -131,28 +131,26 @@ define void @blend_chain_iv(i1 %c) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i64> undef, <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[PREDPHI:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0 +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> undef +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 3 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP7]] ; CHECK-NEXT: store i16 0, ptr [[TMP2]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP4]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP6]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[PREDPHI]], splat (i64 4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 4a4bda254bf88..994e9c1ce64fa 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -366,7 +366,7 @@ define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> +; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/ir<%c.1> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -465,7 +465,7 @@ define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> +; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/ir<%c.0> ; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.1> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: @@ -571,7 +571,7 @@ define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-NEXT: Successor(s): then.0.0 ; CHECK-EMPTY: ; CHECK-NEXT: then.0.0: -; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> +; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/ir<%c.0> ; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0> 
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:

From 7af813700c271beded7c8f6815d3e27c21a4ad9d Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Thu, 31 Jul 2025 11:55:26 +0800
Subject: [PATCH 2/3] Check if CommonEdgeMask is nullptr rather than relying on m_Specific

---
 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 7c6fa5ac0d4de..7555af9a47209 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -271,11 +271,13 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
       }
 
       // If all incoming blocks share a common edge, remove it from the mask.
-      using namespace llvm::VPlanPatternMatch;
-      VPValue *X;
-      if (match(EdgeMask,
-                m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue(X))))
-        EdgeMask = X;
+      if (CommonEdgeMask) {
+        using namespace llvm::VPlanPatternMatch;
+        VPValue *X;
+        if (match(EdgeMask,
+                  m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue(X))))
+          EdgeMask = X;
+      }
 
       OperandsWithMask.push_back(EdgeMask);
     }

From 704d77ef791b6cd289413dd3079bbe4c485e4cc0 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 6 Aug 2025 17:50:08 +0800
Subject: [PATCH 3/3] Move using VPlanPatternMatch out

---
 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 7555af9a47209..2b2f57ffa7ad2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/PostOrderIterator.h"
 
 using namespace llvm;
+using namespace VPlanPatternMatch;
 
 namespace {
 class VPPredicator {
@@ -232,7 +233,6 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
 }
 
 VPValue *VPPredicator::findCommonEdgeMask(const VPWidenPHIRecipe *PhiR) const {
-  using namespace llvm::VPlanPatternMatch;
   VPValue *EdgeMask = getEdgeMask(PhiR->getIncomingBlock(0), PhiR->getParent());
   VPValue *CommonEdgeMask;
   if (!EdgeMask ||
@@ -272,7 +272,6 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
 
       // If all incoming blocks share a common edge, remove it from the mask.
       if (CommonEdgeMask) {
-        using namespace llvm::VPlanPatternMatch;
         VPValue *X;
         if (match(EdgeMask,
                   m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue(X))))
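
To see the shape of the simplification this series performs, here is a
minimal sketch in plain LLVM IR. The names (@blend_sketch_before,
@blend_sketch_after, %alm, %cond) are illustrative only and do not come
from the patch. It assumes what the commit message of PATCH 1/3
describes: every edge mask of the blended phi is a logical-and with the
same active lane mask %alm, and the blend result is only observed
through consumers that are themselves masked by %alm, such as a masked
store.

; Before: the blend mask still carries the common active-lane-mask factor.
define void @blend_sketch_before(<4 x i1> %alm, <4 x i1> %cond, <4 x i32> %a, <4 x i32> %b, ptr %p) {
  %mask = and <4 x i1> %alm, %cond
  %blend = select <4 x i1> %mask, <4 x i32> %a, <4 x i32> %b
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %blend, ptr %p, i32 4, <4 x i1> %alm)
  ret void
}

; After: the common %alm factor is dropped and the blend selects on %cond
; alone. Lanes disabled by %alm may now take the other input, but they are
; never written back, because the consuming store is still masked by %alm.
define void @blend_sketch_after(<4 x i1> %alm, <4 x i1> %cond, <4 x i32> %a, <4 x i32> %b, ptr %p) {
  %blend = select <4 x i1> %cond, <4 x i32> %a, <4 x i32> %b
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %blend, ptr %p, i32 4, <4 x i1> %alm)
  ret void
}

declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)

This mirrors the test updates above: in masked-call.ll, for example,
[[PREDPHI]] now selects directly on the compare [[TMP6]] rather than on
[[TMP7]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer, while
the masked store that consumes [[PREDPHI]] still uses
[[ACTIVE_LANE_MASK]].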