Merged
15 changes: 8 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3111,10 +3111,11 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
 /// using the address to load from, the explicit vector length and an optional
 /// mask.
 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
-  VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+  VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
+                       VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
-                            {L.getAddr(), &EVL}, L.isConsecutive(),
-                            L.isReverse(), L, L.getDebugLoc()),
+                            {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+                            L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -3192,11 +3193,11 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
 /// using the value to store, the address to store to, the explicit vector
 /// length and an optional mask.
 struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
-  VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+  VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr, VPValue &EVL,
+                        VPValue *Mask)
       : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
-                            {S.getAddr(), S.getStoredValue(), &EVL},
-                            S.isConsecutive(), S.isReverse(), S,
-                            S.getDebugLoc()) {
+                            {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
+                            S.isReverse(), S, S.getDebugLoc()) {
     setMask(Mask);
   }

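Taken together, the two constructor changes above make the address an explicit argument instead of always reusing getAddr() of the recipe being replaced. A minimal caller-side sketch of the difference, mirroring the updated call sites in optimizeMaskToEVL below (the locals L, S, EVL, NewAddr and NewMask are placeholders, not part of this patch):

  // Before: the EVL recipe inherited the original address unconditionally.
  new VPWidenLoadEVLRecipe(*L, EVL, NewMask);        // address = L->getAddr()
  // After: the caller chooses the address, so it can pass an EVL-adjusted
  // end pointer for reverse accesses instead of the VF-based one.
  new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
  new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);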
31 changes: 28 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2137,6 +2137,8 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
                                        VPRecipeBase &CurRecipe,
                                        VPTypeAnalysis &TypeInfo,
                                        VPValue &AllOneMask, VPValue &EVL) {
+  // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
+  // header mask.
   auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
     assert(OrigMask && "Unmasked recipe when folding tail");
     // HeaderMask will be handled using EVL.
@@ -2146,14 +2148,35 @@
     return HeaderMask == OrigMask ? nullptr : OrigMask;
   };
 
+  /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
+  auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
+    auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
+    if (!EndPtr)
+      return Addr;
+    assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF() &&
+           "VPVectorEndPointerRecipe with non-VF VF operand?");
+    assert(
+        all_of(EndPtr->users(),
+               [](VPUser *U) {
+                 return cast<VPWidenMemoryRecipe>(U)->isReverse();
+               }) &&
+        "VPVectorEndPointRecipe not used by reversed widened memory recipe?");
+    VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
+    EVLAddr->insertBefore(&CurRecipe);
+    EVLAddr->setOperand(1, &EVL);
+    return EVLAddr;
+  };
+
   return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
       .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
         VPValue *NewMask = GetNewMask(L->getMask());
-        return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
+        VPValue *NewAddr = GetNewAddr(L->getAddr());
+        return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
       })
       .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
         VPValue *NewMask = GetNewMask(S->getMask());
-        return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
+        VPValue *NewAddr = GetNewAddr(S->getAddr());
+        return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
       })
       .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
         VPValue *NewMask = GetNewMask(Red->getCondOp());
@@ -2188,7 +2211,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                     IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
                             VPWidenIntOrFpInductionRecipe>) &&
                 "User of VF that we can't transform to EVL.");
-  Plan.getVF().replaceAllUsesWith(&EVL);
+  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+    return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
+  });
 
   assert(all_of(Plan.getVFxUF().users(),
                 [&Plan](VPUser *U) {
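The GetNewAddr hook above carries the main point of the patch: a reversed consecutive access is addressed through a VPVectorEndPointerRecipe whose second operand is the element count, and once the memory recipe is rewritten to an EVL recipe only EVL lanes are accessed, so the cloned end pointer is fed EVL instead of VF. Consistently, VF uses are no longer blanket-replaced with EVL; only the widened-induction and scalar-step recipes still get that substitution, while end pointers are adjusted per memory recipe. Below is a small, self-contained C++ illustration (plain arithmetic with made-up values, not LLVM code) of the end-pointer math that shows up in the IF-EVL test output that follows:

  #include <cstdint>
  #include <cstdio>

  // A reversed access of N lanes that ends at the current element starts
  // N - 1 elements before it, so the pointer handed to the store must be
  // derived from the number of lanes actually written.
  static int64_t reverseStartOffset(uint64_t numLanes) {
    return -static_cast<int64_t>(numLanes - 1); // offset of the lowest-addressed lane
  }

  int main() {
    const uint64_t VF = 4;  // full vector width
    const uint64_t EVL = 3; // e.g. a final, partially full iteration
    std::printf("start offset with VF  = %lld\n",
                static_cast<long long>(reverseStartOffset(VF)));
    std::printf("start offset with EVL = %lld\n",
                static_cast<long long>(reverseStartOffset(EVL)));
    // A vp.store of EVL lanes must begin at base - 2 here; a VF-based end
    // pointer (base - 3) would place the active lanes one element too low.
    return 0;
  }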
132 changes: 132 additions & 0 deletions llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -981,3 +981,135 @@ exit:

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}


define void @interleave_reverse(ptr noalias %a, ptr noalias %b) {
; IF-EVL-LABEL: @interleave_reverse(
; IF-EVL-NEXT: entry:
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; IF-EVL-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 -1)
; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 1023), [[TMP6]]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY1]] ]
; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY1]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ 1023, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY1]] ]
; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 -1, [[TMP8]]
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; IF-EVL-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 0, [[TMP14]]
; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], 1
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP16]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP15]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP17]]
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP7]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP17:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; IF-EVL: scalar.ph:
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ 1023, [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV1]], i32 0
; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV1]], i32 1
; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT1]] = add nsw i64 [[IV1]], -1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 0
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; IF-EVL: for.cond.cleanup:
; IF-EVL-NEXT: ret void
;
; NO-VP-LABEL: @interleave_reverse(
; NO-VP-NEXT: entry:
; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NO-VP: vector.ph:
; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ]
; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY1]] ]
; NO-VP-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <4 x i64> [[VEC_IND]], i32 0
; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <4 x i64> [[STEP_ADD]], i32 0
; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP13]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <4 x i64> [[VEC_IND]], i32 1
; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <4 x i64> [[STEP_ADD]], i32 1
; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
; NO-VP-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER2]], [[WIDE_MASKED_GATHER]]
; NO-VP-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER1]]
; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 -3
; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -4
; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -3
; NO-VP-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; NO-VP-NEXT: store <4 x i32> [[REVERSE]], ptr [[TMP8]], align 4
; NO-VP-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; NO-VP-NEXT: store <4 x i32> [[REVERSE4]], ptr [[TMP10]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP16:![0-9]+]]
; NO-VP: middle.block:
; NO-VP-NEXT: br label [[SCALAR_PH]]
; NO-VP: scalar.ph:
; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
; NO-VP: for.body:
; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
; NO-VP-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; NO-VP-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; NO-VP: for.cond.cleanup:
; NO-VP-NEXT: ret void
;
entry:
br label %for.body

for.body:
%iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 0
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 1
%1 = load i32, ptr %arrayidx2
%add = add nsw i32 %1, %0
%arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
store i32 %add, ptr %arrayidx4
%iv.next = add nsw i64 %iv, -1
%exitcond.not = icmp eq i64 %iv.next, 0
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:
ret void
}