From 9d154299ba9b0cebd02f35177d5e0a0b50dd209c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 21 Jul 2025 11:47:53 +0100 Subject: [PATCH 1/4] [LV] Also clamp MaxVF by trip count when maximizing vector bandwidth. Also clamp the max VF when maximizing vector bandwidth by the maximum trip count. Otherwise we may end up choosing a VF for which the vector loop never executes. --- .../Transforms/Vectorize/LoopVectorize.cpp | 73 +++++++++++-------- .../AArch64/partial-reduce-dot-product.ll | 35 +++------ 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 773c1559ec679..dc26ef640ff8d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1505,6 +1505,11 @@ class LoopVectorizationCostModel { ElementCount UserVF, bool FoldTailByMasking); + /// If \p VF > MaxTripcount, clamps it to the next lower VF that is <= + /// MaxTripCount. + ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount, + bool FoldTailByMasking) const; + /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. /// This is a helper function of computeFeasibleMaxVF. @@ -3854,6 +3859,38 @@ bool LoopVectorizationCostModel::useMaxBandwidth( Legal->hasVectorCallVariants()))); } +ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount( + ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const { + unsigned EstimatedVF = VF.getKnownMinValue(); + if (VF.isScalable() && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); + auto Min = Attr.getVScaleRangeMin(); + EstimatedVF *= Min; + } + + // When a scalar epilogue is required, at least one iteration of the scalar + // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a + // max VF that results in a dead vector loop. + if (MaxTripCount > 0 && requiresScalarEpilogue(true)) + MaxTripCount -= 1; + + if (MaxTripCount && MaxTripCount <= EstimatedVF && + (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { + // If upper bound loop trip count (TC) is known at compile time there is no + // point in choosing VF greater than TC (as done in the loop below). Select + // maximum power of two which doesn't exceed TC. If VF is + // scalable, we only fall back on a fixed VF when the TC is less than or + // equal to the known number of lanes. + auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " + "exceeding the constant trip count: " + << ClampedUpperTripCount << "\n"); + return ElementCount::get(ClampedUpperTripCount, + FoldTailByMasking ? 
VF.isScalable() : false); + } + return VF; +} + ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, ElementCount MaxSafeVF, bool FoldTailByMasking) { @@ -3885,40 +3922,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return ElementCount::getFixed(1); } - unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); - if (MaxVectorElementCount.isScalable() && - TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); - auto Min = Attr.getVScaleRangeMin(); - WidestRegisterMinEC *= Min; - } - - // When a scalar epilogue is required, at least one iteration of the scalar - // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a - // max VF that results in a dead vector loop. - if (MaxTripCount > 0 && requiresScalarEpilogue(true)) - MaxTripCount -= 1; - - if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && - (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { - // If upper bound loop trip count (TC) is known at compile time there is no - // point in choosing VF greater than TC (as done in the loop below). Select - // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is - // scalable, we only fall back on a fixed VF when the TC is less than or - // equal to the known number of lanes. - auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); - LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " - "exceeding the constant trip count: " - << ClampedUpperTripCount << "\n"); - return ElementCount::get( - ClampedUpperTripCount, - FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); - } + ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount, + MaxTripCount, FoldTailByMasking); + if (MaxVF != MaxVectorElementCount) + return MaxVF; TargetTransformInfo::RegisterKind RegKind = ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector; - ElementCount MaxVF = MaxVectorElementCount; if (MaxVF.isScalable()) MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF; @@ -3940,6 +3951,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( } } + MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking); + // Invalidate any widening decisions we might have made, in case the loop // requires prediction (decided later), but we have already made some // load/store widening decisions. 
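Before the test updates, a worked example of the new clamp. The following is a minimal standalone sketch in plain C++20, not LLVM code: the function mirrors the decision clampVFByMaxTripCount makes above, but the names are illustrative and both tail folding and the vscale estimate for scalable VFs are left out.

#include <bit>
#include <cstdio>

// Illustrative stand-in (assumption: VF is already an estimated lane count).
static unsigned clampVFByMaxTripCount(unsigned VF, unsigned MaxTripCount,
                                      bool RequiresScalarEpilogue) {
  // One iteration must stay scalar when an epilogue is required, so only
  // MaxTripCount - 1 iterations are available to the vector loop.
  if (MaxTripCount > 0 && RequiresScalarEpilogue)
    MaxTripCount -= 1;
  // If the remaining iterations cannot fill the candidate VF, clamp to the
  // largest power of two that does not exceed them.
  if (MaxTripCount != 0 && MaxTripCount <= VF)
    return std::bit_floor(MaxTripCount);
  return VF;
}

int main() {
  // Trip count 16 with a scalar epilogue: 15 iterations reach the vector
  // loop, so a candidate VF of 16 is clamped to 8.
  std::printf("%u\n", clampVFByMaxTripCount(16, 16, true));  // prints 8
  // Trip count 41 (used by the updated tests later in this series): a
  // candidate VF of 16 fits and is left unchanged.
  std::printf("%u\n", clampVFByMaxTripCount(16, 41, false)); // prints 16
}

With the logic factored into a helper, getMaximizedVFForTarget can apply the same clamp both before and after maximizing vector bandwidth, which is what the hunk above does.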
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 14725e0eb2096..4dec9785fe90a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1904,39 +1904,28 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] -; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1 -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP14]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i64> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP2]] +; CHECK-MAXBW-NEXT: [[TMP13]] = add <16 x i64> [[VEC_PHI]], [[TMP14]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: -;
CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP19]]) -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP13]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: ; entry: From 1ab9b9265954886ac4e0473650a9dce00f4c34ba Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 21 Jul 2025 13:30:20 +0100 Subject: [PATCH 2/4] !fixup update tests. --- .../AArch64/interleave-with-gaps.ll | 51 ------------------ .../AArch64/partial-reduce-dot-product.ll | 54 +++++++++++-------- 2 files changed, 32 insertions(+), 73 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index 298ef09904b77..8cfacd2882f20 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -288,56 +288,6 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-LABEL: define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]], ptr noalias [[F:%.*]], ptr noalias [[G:%.*]], ptr noalias [[H:%.*]], ptr noalias [[I:%.*]], ptr noalias [[J:%.*]], ptr noalias [[K:%.*]], ptr [[L:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[J]], i64 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP0]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[STRIDED_VEC]] to <8 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 6 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[K]], i64 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[K]], i64 10 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[K]], i64 12 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[K]], i64 14 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 -; CHECK-NEXT: store i16 [[TMP14]], ptr [[TMP6]], align 2 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 -; CHECK-NEXT: store i16 [[TMP15]], ptr [[TMP7]], align 2 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 -; CHECK-NEXT: store i16 [[TMP16]], ptr [[TMP8]], align 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 -; CHECK-NEXT: store i16 [[TMP17]], ptr [[TMP9]], align 2 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 -; CHECK-NEXT: store i16 [[TMP18]], ptr [[TMP10]], align 2 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 -; CHECK-NEXT: store i16 [[TMP19]], ptr [[TMP11]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 -; CHECK-NEXT: store 
i16 [[TMP20]], ptr [[TMP12]], align 2 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 -; CHECK-NEXT: store i16 [[TMP21]], ptr [[TMP13]], align 2 -; CHECK-NEXT: store i64 0, ptr [[A]], align 8 -; CHECK-NEXT: store i64 0, ptr [[B]], align 8 -; CHECK-NEXT: store i64 0, ptr [[C]], align 8 -; CHECK-NEXT: store i64 0, ptr [[D]], align 8 -; CHECK-NEXT: store i64 0, ptr [[E]], align 8 -; CHECK-NEXT: store i64 0, ptr [[F]], align 8 -; CHECK-NEXT: store i64 0, ptr [[G]], align 8 -; CHECK-NEXT: store i64 0, ptr [[H]], align 8 -; CHECK-NEXT: store i64 0, ptr [[I]], align 8 -; CHECK-NEXT: store i64 0, ptr [[L]], align 8 -; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]] ; CHECK-NEXT: [[L_J:%.*]] = load i64, ptr [[GEP_J]], align 8 ; CHECK-NEXT: [[L_TRUNC:%.*]] = trunc i64 [[L_J]] to i16 @@ -355,7 +305,6 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 2 ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 14 -; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 4dec9785fe90a..14a73db15b8cc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1816,13 +1816,12 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1845,7 +1844,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP15]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = 
icmp eq i64 41, [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: ; @@ -1854,13 +1853,13 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 16, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 16, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1897,35 +1896,46 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP25]], [[TMP24]] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: ; ; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], 
align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0 -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i64> -; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <16 x i64> [[TMP6]], [[TMP2]] -; CHECK-MAXBW-NEXT: [[TMP13]] = add <16 x i64> [[VEC_PHI]], [[TMP14]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw [[TMP17]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP14]] = add [[VEC_PHI]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP13]]) -; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP14]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: ; entry: @@ -1943,7 +1953,7 @@ for.body: ; preds = %entry, %for.body %conv3 = zext i8 %1 to i64 %mul = mul nuw nsw i64 %conv3, %conv %add = add i64 %sum, %mul - %exitcond.not = icmp eq i64 %i.iv.next, 16 + %exitcond.not = icmp eq i64 %i.iv.next, 41 br i1 %exitcond.not, label %exit, label %for.body exit: ; preds = %for.body From 46d3728e120d2c81977d9786b3eaffb03cbb1acd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 22 Jul 2025 13:29:14 +0100 Subject: [PATCH 3/4] !fixup update test after rebase --- .../AArch64/interleave-with-gaps.ll | 55 +++++++++++++++++-- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index 8cfacd2882f20..5b8acee40d63a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -288,10 +288,56 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-LABEL: define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]], ptr noalias [[F:%.*]], ptr noalias [[G:%.*]], ptr noalias [[H:%.*]], ptr noalias [[I:%.*]], ptr noalias [[J:%.*]], ptr noalias [[K:%.*]], ptr [[L:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[IV:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 ; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]] -; CHECK-NEXT: [[L_J:%.*]] = load i64, ptr [[GEP_J]], align 8 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[B]], align 8 +; CHECK-NEXT: store i64 0, ptr [[C]], align 8 +; CHECK-NEXT: store i64 0, ptr [[D]], align 8 +; CHECK-NEXT: store i64 0, ptr [[E]], align 8 +; CHECK-NEXT: store i64 0, ptr [[F]], align 8 +; CHECK-NEXT: store i64 0, ptr [[G]], align 8 +; CHECK-NEXT: store i64 0, ptr [[H]], align 8 +; CHECK-NEXT: store i64 0, ptr [[I]], align 8 +; CHECK-NEXT: store i64 0, ptr [[L]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_J1:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV1]] +; CHECK-NEXT: [[L_J:%.*]] = load i64, ptr [[GEP_J1]], align 8 ; CHECK-NEXT: [[L_TRUNC:%.*]] = trunc i64 [[L_J]] to i16 -; CHECK-NEXT: [[GEP_K:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]] +; CHECK-NEXT: [[GEP_K:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV1]] ; CHECK-NEXT: store i16 [[L_TRUNC]], ptr [[GEP_K]], align 2 ; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: store i64 0, ptr [[B]], align 8 @@ -303,8 +349,9 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: store i64 0, ptr [[H]], align 8 ; CHECK-NEXT: store i64 0, ptr [[I]], align 8 ; CHECK-NEXT: store i64 0, ptr [[L]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 2 -; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 14 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 2 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV1]], 14 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: 
[[EXIT]]: ; CHECK-NEXT: ret void ; From 0102bdf54d02acd1da5405bf7d424355f8760d28 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 22 Jul 2025 14:25:30 +0100 Subject: [PATCH 4/4] !fixup only invalidate decisions if MaxVF changed after maximizing. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index dc26ef640ff8d..b95db1c43b30c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3924,6 +3924,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( ElementCount MaxVF = clampVFByMaxTripCount(MaxVectorElementCount, MaxTripCount, FoldTailByMasking); + // If the MaxVF was already clamped, there's no point in trying to pick a + // larger one. if (MaxVF != MaxVectorElementCount) return MaxVF; @@ -3953,10 +3955,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( MaxVF = clampVFByMaxTripCount(MaxVF, MaxTripCount, FoldTailByMasking); - // Invalidate any widening decisions we might have made, in case the loop - // requires prediction (decided later), but we have already made some - // load/store widening decisions. - invalidateCostModelingDecisions(); + if (MaxVectorElementCount != MaxVF) { + // Invalidate any widening decisions we might have made, in case the loop + // requires prediction (decided later), but we have already made some + // load/store widening decisions. + invalidateCostModelingDecisions(); + } } return MaxVF; }
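For context on the dead vector loop mentioned in the first commit message: before this series, the dotp_cost_disagreement test (constant trip count 16) could be vectorized under maximized bandwidth with VF = vscale x 8, and the emitted minimum-iterations guard then skips the vector loop whenever vscale makes the VF exceed the trip count. The sketch below is plain C++ modelling only that runtime guard, not LLVM code; the numbers come from the test above.

#include <cstdio>

int main() {
  const unsigned TripCount = 16;  // constant trip count of the test loop
  const unsigned KnownMinVF = 8;  // VF = vscale x 8 picked when maximizing bandwidth
  for (unsigned VScale : {1u, 2u, 4u, 8u}) {
    unsigned VF = VScale * KnownMinVF;
    // Mirrors the guard "icmp ult i64 16, vscale * 8": if the trip count
    // cannot fill one vector iteration, control branches to the scalar loop.
    bool VectorLoopRuns = TripCount >= VF;
    std::printf("vscale=%u  VF=%u  vector loop %s\n", VScale, VF,
                VectorLoopRuns ? "runs" : "is dead");
  }
}

Clamping the VF by the maximum trip count keeps the chosen VF no larger than what the loop can actually fill, so the vector loop stays reachable regardless of vscale; the trip count in the affected tests was then bumped to 41 so they keep exercising the scalable VF.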