37 changes: 31 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7079,6 +7079,20 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
return true;

if (WidenMemR->isReverse()) {
// If the stored value of a reverse store is invariant, LICM will
// hoist the reverse operation to the preheader. In this case, the
// result of the VPlan-based cost model will diverge from that of
// the legacy model.
if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;

if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
return true;
}
}

/// If a VPlan transform folded a recipe to one producing a single-scalar,
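Note: to make the new isReverse() check above concrete, here is a minimal, self-contained sketch of the kind of source loop it guards against; the function and variable names are illustrative and not taken from the patch or its tests.

// Illustrative only: a descending-index store of a loop-invariant value.
// After vectorization, the splat of the stored value and therefore its
// vector reverse are loop-invariant, so LICM can hoist the reverse into the
// preheader and the VPlan-based cost of the loop body diverges from the
// legacy model, which is the case the check above flags.
void reverse_store_invariant(int *a, int c, long n) {
  for (long i = n - 1; i >= 0; --i)
    a[i] = c; // stored value does not depend on the loop
}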
@@ -7610,8 +7624,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}

VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
VFRange &Range) {
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
VFRange &Range) {
assert((VPI->getOpcode() == Instruction::Load ||
VPI->getOpcode() == Instruction::Store) &&
"Must be called with either a load or store");
@@ -7672,15 +7686,26 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}

if (VPI->getOpcode() == Instruction::Load) {
auto *Load = cast<LoadInst>(I);
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI,
VPI->getDebugLoc());
auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
*VPI, Load->getDebugLoc());
if (Reverse) {
Builder.insert(LoadR);
return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
LoadR->getDebugLoc());
}
return LoadR;
}

StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
Consecutive, Reverse, *VPI, VPI->getDebugLoc());
VPValue *StoredVal = VPI->getOperand(0);
if (Reverse)
StoredVal = Builder.createNaryOp(VPInstruction::Reverse, StoredVal,
Store->getDebugLoc());
return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
Reverse, *VPI, Store->getDebugLoc());
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will
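Note: because tryToWidenMemory now returns the standalone reverse rather than the memory recipe for reversed accesses, a caller that wants the underlying widened load has to look through one level of definition. Below is a minimal sketch of that pattern, assuming the usual VPlan headers are in scope; the helper name is hypothetical, and the patch itself uses the m_Reverse matcher added to VPlanPatternMatch.h further down instead.

// Sketch only: recover the VPWidenLoadRecipe feeding a VPInstruction::Reverse
// returned by tryToWidenMemory; returns nullptr for anything else.
static VPWidenLoadRecipe *getLoadBehindReverse(VPRecipeBase *R) {
  auto *VPI = dyn_cast<VPInstruction>(R);
  if (!VPI || VPI->getOpcode() != VPInstruction::Reverse)
    return nullptr;
  return dyn_cast_or_null<VPWidenLoadRecipe>(
      VPI->getOperand(0)->getDefiningRecipe());
}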
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,7 +92,7 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p VPI should be widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);

/// Check if an induction recipe should be constructed for \p VPI. If so build
/// and return it. If not, return null.
11 changes: 11 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1109,6 +1109,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// Unrolling will add all copies of its original operand as additional
// operands.
LastActiveLane,
// Returns a reversed vector for the operand.
Reverse,

// The opcodes below are used for VPInstructionWithType.
//
@@ -3468,6 +3470,15 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
setMask(Mask);
}

VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr,
VPValue *StoredVal, VPValue &EVL, VPValue *Mask)
Review comment (Contributor), on lines +3473 to +3474: the only addition here is the stored value, right? can we unify to a single constructor, and clarify that VPWidenStoreRecipe is only used for other fields?

: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
{Addr, StoredVal, &EVL}, S.isConsecutive(),
S.isReverse(), S, S.getDebugLoc()) {
assert(isReverse() && "Only reverse accesses need to set a new stored value");
setMask(Mask);
}

VP_CLASSOF_IMPL(VPDef::VPWidenStoreEVLSC)

/// Return the address accessed by this recipe.
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -136,6 +136,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::Reverse:
// Return the type based on first operand.
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -462,6 +462,12 @@ m_LastActiveLane(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::LastActiveLane>(Op0);
}

template <typename Op0_t>
inline VPInstruction_match<VPInstruction::Reverse, Op0_t>
m_Reverse(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::Reverse>(Op0);
}

inline VPInstruction_match<VPInstruction::StepVector> m_StepVector() {
return m_VPInstruction<VPInstruction::StepVector>();
}
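Note: a minimal usage sketch of the new m_Reverse matcher, assuming the llvm::VPlanPatternMatch helpers are in scope; StoredVal stands for whatever VPValue is being inspected.

// Sketch only: strip a VPInstruction::Reverse from a stored value before
// looking at the vector that is actually being reversed.
VPValue *ReversedVal = nullptr;
if (match(StoredVal, m_Reverse(m_VPValue(ReversedVal)))) {
  // ReversedVal binds to the reverse's operand, i.e. the un-reversed value.
}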
77 changes: 38 additions & 39 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -444,6 +444,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::Not:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::Reverse:
case VPInstruction::Unpack:
return 1;
case Instruction::ICmp:
@@ -917,6 +918,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::ResumeForEpilogue:
return State.get(getOperand(0), true);
case VPInstruction::Reverse:
return Builder.CreateVectorReverse(State.get(getOperand(0)), "reverse");
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -1103,6 +1106,14 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::Reverse: {
assert(VF.isVector() && "Reverse operation must be vector type");
auto *VectorTy = cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
VectorTy, /*Mask=*/{}, Ctx.CostKind,
/*Index=*/0);
}
case VPInstruction::ExtractLastLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
@@ -1205,6 +1216,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::WidePtrAdd:
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
case VPInstruction::Reverse:
case VPInstruction::VScale:
case VPInstruction::Unpack:
return false;
@@ -1382,6 +1394,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
case VPInstruction::Reverse:
O << "reverse";
break;
case VPInstruction::Unpack:
O << "unpack";
break;
@@ -2260,18 +2275,32 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
VPValue *Operand = getOperand(0);
TTI::CastContextHint CCH = TTI::CastContextHint::None;
// For Trunc/FPTrunc, get the context from the only user.
if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
!hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(*user_begin()))
CCH = ComputeCCH(StoreRecipe);
if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
return nullptr;
return dyn_cast<VPRecipeBase>(*R->user_begin());
};

if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
if (match(Recipe, m_Reverse(m_VPValue())))
Recipe = GetOnlyUser(cast<VPInstruction>(Recipe));
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}
// For Z/Sext, get the context from the operand.
else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
Opcode == Instruction::FPExt) {
Review comment (Contributor), on lines 2293 to 2294: Not sure I understand why the code is guarded by Trunc, FPTrunc, FPExt, ZExt, or SExt opcodes?

Reply (Contributor, author): This code is used to determine TTI::CastContextHint. Perhaps for other cast opcodes, there isn't currently a situation that requires this hint. But I think that's a separate issue. The change here is just to ensure that CastContextHint::Reversed is still correctly propagated after the reverse operation is separated out.

if (Operand->isLiveIn())
CCH = TTI::CastContextHint::Normal;
else if (Operand->getDefiningRecipe())
CCH = ComputeCCH(Operand->getDefiningRecipe());
else if (auto *Recipe = Operand->getDefiningRecipe()) {
VPValue *ReverseOp;
if (match(Recipe, m_Reverse(m_VPValue(ReverseOp))))
Recipe = ReverseOp->getDefiningRecipe();
if (Recipe)
CCH = ComputeCCH(Recipe);
}
}

auto *SrcTy =
@@ -3519,12 +3548,7 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
Cost += Ctx.TTI.getMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind,
OpInfo, &Ingredient);
}
if (!Reverse)
return Cost;

return Cost += Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
return Cost;
}

void VPWidenLoadRecipe::execute(VPTransformState &State) {
Expand Down Expand Up @@ -3555,8 +3579,6 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateAlignedLoad(DataTy, Addr, Alignment, "wide.load");
}
applyMetadata(*cast<Instruction>(NewLI));
if (Reverse)
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
State.set(this, NewLI);
}

@@ -3611,8 +3633,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
Instruction *Res = NewLI;
if (isReverse())
Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}

@@ -3629,15 +3649,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3666,13 +3680,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}

Value *StoredVal = State.get(StoredVPValue);
if (isReverse()) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
Value *Addr = State.get(getAddr(), /*IsScalar*/ !CreateScatter);
Instruction *NewSI = nullptr;
if (CreateScatter)
@@ -3701,8 +3708,6 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
if (isReverse())
StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
@@ -3739,15 +3744,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
InstructionCost Cost = Ctx.TTI.getMemIntrinsicInstrCost(
return Ctx.TTI.getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
Ctx.CostKind);
if (!Reverse)
return Cost;

return Cost + Ctx.TTI.getShuffleCost(
TargetTransformInfo::SK_Reverse, cast<VectorType>(Ty),
cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
34 changes: 26 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2858,25 +2858,43 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
EVL, Mask);

if (match(&CurRecipe,
VPValue *ReversedVal;
if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
match(ReversedVal,
m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
auto *LoadR = new VPWidenLoadEVLRecipe(
*cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
LoadR->insertBefore(&CurRecipe);
return new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
TypeInfo.inferScalarType(LoadR), {}, {}, DL);
}

if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(),
m_RemoveMask(HeaderMask, Mask))) &&
!cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
EVL, Mask);

if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(),
VPValue *StoredVal;
if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
Review comment (Contributor), on lines +2881 to +2882: Can this just match the reverse in the same line so you don't need the separate match on line 2886

Suggested change: replace
    VPValue *StoredVal;
    if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_VPValue(StoredVal),
with
    if (match(&CurRecipe, m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),

m_RemoveMask(HeaderMask, Mask))) &&
match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), EVL, Mask);
cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
if (match(StoredVal, m_Reverse(m_VPValue(ReversedVal)))) {
auto *NewReverse = new VPWidenIntrinsicRecipe(
Intrinsic::experimental_vp_reverse,
{ReversedVal, Plan->getTrue(), &EVL},
TypeInfo.inferScalarType(ReversedVal), {}, {},
cast<VPInstruction>(StoredVal)->getDebugLoc());
Review comment (Contributor): I think you can just use CurRecipe's debugloc, since the original reverse is created with the store's debugloc?

Suggested change: replace
    cast<VPInstruction>(StoredVal)->getDebugLoc());
with
    CurRecipe.getDebugLoc());

Review comment (Contributor): I don't think we have tests with debug location for EVL transform, would be good to add some.

NewReverse->insertBefore(&CurRecipe);
return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
AdjustEndPtr(EndPtr), NewReverse, EVL,
Mask);
}
}

if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
if (Rdx->isConditional() &&
@@ -22,8 +22,8 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE9]]

entry:
@@ -37,8 +37,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -24
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -56
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer