
Commit 16598e6

Transform gathers to strided loads
1 parent 9ee98f8 commit 16598e6
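
In source terms, the pattern this commit targets looks like the hypothetical kernel below (illustrative only, not taken from the commit): the load address advances by a loop-invariant stride, so on targets with strided memory operations the vectorizer can emit a single strided load instead of a gather.

// Hypothetical example: src is read at a constant element stride s, so one
// vector iteration needs lanes src[i*s], src[(i+1)*s], ... Without this
// transform the widened load becomes a gather; with it, targets where a
// strided load is legal and cheaper get one strided memory operation with
// byte stride s * sizeof(float).
void strided_copy(float *dst, const float *src, long n, long s) {
  for (long i = 0; i < n; ++i)
    dst[i] = src[i * s];
}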

File tree: 4 files changed (+207, −22 lines)


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 12 additions & 8 deletions
@@ -8912,20 +8912,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // Adjust the recipes for any inloop reductions.
   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
 
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                        CM.CostKind);
   // Transform recipes to abstract recipes if it is legal and beneficial and
   // clamp the range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
-  if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
-                          CM.CostKind);
+  if (!CM.foldTailWithEVL())
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
-  }
-
-  for (ElementCount VF : Range)
-    Plan->addVF(VF);
-  Plan->setName("Initial VPlan");
 
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
@@ -8934,6 +8929,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                                 InterleaveGroups, RecipeBuilder,
                                 CM.isScalarEpilogueAllowed());
 
+  // Convert memory recipes to strided access recipes if the strided access is
+  // legal and profitable.
+  VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+                           CostCtx, Range);
+
+  for (ElementCount VF : Range)
+    Plan->addVF(VF);
+  Plan->setName("Initial VPlan");
+
   // Replace VPValues for known constant strides guaranteed by predicate scalar
   // evolution.
   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
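
Two details of this hunk: the VPCostContext is hoisted out of the EVL guard so the same cost context can feed both convertToAbstractRecipes and the new convertToStridedAccesses pass, and the addVF loop plus setName move below the new pass because convertToStridedAccesses clamps Range, so VFs must be registered only after that clamping to stay consistent.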

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 14 additions & 14 deletions
@@ -1644,20 +1644,6 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
 
 /// A recipe for handling GEP instructions.
 class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
-  bool isPointerLoopInvariant() const {
-    return getOperand(0)->isDefinedOutsideLoopRegions();
-  }
-
-  bool isIndexLoopInvariant(unsigned I) const {
-    return getOperand(I + 1)->isDefinedOutsideLoopRegions();
-  }
-
-  bool areAllOperandsInvariant() const {
-    return all_of(operands(), [](VPValue *Op) {
-      return Op->isDefinedOutsideLoopRegions();
-    });
-  }
-
 public:
   VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef<VPValue *> Operands)
       : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {
@@ -1676,6 +1662,20 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
 
   VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
 
+  bool isPointerLoopInvariant() const {
+    return getOperand(0)->isDefinedOutsideLoopRegions();
+  }
+
+  bool isIndexLoopInvariant(unsigned I) const {
+    return getOperand(I + 1)->isDefinedOutsideLoopRegions();
+  }
+
+  bool areAllOperandsInvariant() const {
+    return all_of(operands(), [](VPValue *Op) {
+      return Op->isDefinedOutsideLoopRegions();
+    });
+  }
+
   /// Generate the gep nodes.
   void execute(VPTransformState &State) override;

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 175 additions & 0 deletions
@@ -2681,6 +2681,181 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
     R->dissolveToCFGLoop();
 }
 
+static std::pair<VPValue *, VPValue *> matchStridedStart(VPValue *CurIndex) {
+  if (auto *WidenIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(CurIndex))
+    return {WidenIV, WidenIV->getStepValue()};
+
+  auto *WidenR = dyn_cast<VPWidenRecipe>(CurIndex);
+  if (!WidenR || !CurIndex->getUnderlyingValue())
+    return {nullptr, nullptr};
+
+  unsigned Opcode = WidenR->getOpcode();
+  // TODO: Support Instruction::Add and Instruction::Or.
+  if (Opcode != Instruction::Shl && Opcode != Instruction::Mul)
+    return {nullptr, nullptr};
+
+  // Match the pattern binop(variant, invariant), or binop(invariant, variant)
+  // if the binary operator is commutative.
+  bool IsLHSUniform = vputils::isSingleScalar(WidenR->getOperand(0));
+  if (IsLHSUniform == vputils::isSingleScalar(WidenR->getOperand(1)) ||
+      (IsLHSUniform && !Instruction::isCommutative(Opcode)))
+    return {nullptr, nullptr};
+  unsigned VarIdx = IsLHSUniform ? 1 : 0;
+
+  auto [Start, Stride] = matchStridedStart(WidenR->getOperand(VarIdx));
+  if (!Start)
+    return {nullptr, nullptr};
+
+  SmallVector<VPValue *> StartOps(WidenR->operands());
+  StartOps[VarIdx] = Start;
+  auto *StartR = new VPReplicateRecipe(WidenR->getUnderlyingInstr(), StartOps,
+                                       /*IsUniform*/ true);
+  StartR->insertBefore(WidenR);
+
+  unsigned InvIdx = VarIdx == 0 ? 1 : 0;
+  auto *StrideR =
+      new VPInstruction(Opcode, {Stride, WidenR->getOperand(InvIdx)});
+  StrideR->insertBefore(WidenR);
+  return {StartR, StrideR};
+}
+
+static std::pair<VPValue *, VPValue *>
+determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) {
+  // Not considered strided if both the base pointer and all indices are
+  // loop-invariant.
+  if (WidenGEP->areAllOperandsInvariant())
+    return {nullptr, nullptr};
+
+  // TODO: Check if the base pointer is strided.
+  if (!WidenGEP->isPointerLoopInvariant())
+    return {nullptr, nullptr};
+
+  // Find the only variant index.
+  unsigned VarOp = 0;
+  for (unsigned I = 1, E = WidenGEP->getNumOperands(); I < E; I++) {
+    if (WidenGEP->isIndexLoopInvariant(I - 1))
+      continue;
+
+    if (VarOp != 0)
+      return {nullptr, nullptr};
+    VarOp = I;
+  }
+
+  if (VarOp == 0)
+    return {nullptr, nullptr};
+
+  // TODO: Support cases with a variant index in the middle.
+  if (VarOp != WidenGEP->getNumOperands() - 1)
+    return {nullptr, nullptr};
+
+  VPValue *VarIndex = WidenGEP->getOperand(VarOp);
+  auto [Start, Stride] = matchStridedStart(VarIndex);
+  if (!Start)
+    return {nullptr, nullptr};
+
+  SmallVector<VPValue *> Ops(WidenGEP->operands());
+  Ops[VarOp] = Start;
+  auto *BasePtr = new VPReplicateRecipe(WidenGEP->getUnderlyingInstr(), Ops,
+                                        /*IsUniform*/ true);
+  BasePtr->insertBefore(WidenGEP);
+
+  return {BasePtr, Stride};
+}
+
+void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+                                               VFRange &Range) {
+  if (Plan.hasScalarVFOnly())
+    return;
+
+  DenseMap<VPWidenGEPRecipe *, std::pair<VPValue *, VPValue *>> StrideCache;
+  SmallVector<VPRecipeBase *> ToErase;
+  SmallPtrSet<VPValue *, 4> PossiblyDead;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *MemR = dyn_cast<VPWidenMemoryRecipe>(&R);
+      // TODO: Support strided stores.
+      // TODO: Support reverse accesses.
+      // TODO: Transform interleave accesses into multiple strided accesses.
+      if (!MemR || !isa<VPWidenLoadRecipe>(MemR) || MemR->isConsecutive())
+        continue;
+
+      auto *Ptr = dyn_cast<VPWidenGEPRecipe>(MemR->getAddr());
+      if (!Ptr)
+        continue;
+
+      // The memory cost model requires the pointer operand of the memory
+      // access instruction.
+      Value *PtrUV = Ptr->getUnderlyingValue();
+      if (!PtrUV)
+        continue;
+
+      // Try to get the base and stride here.
+      VPValue *BasePtr, *Stride;
+      auto It = StrideCache.find(Ptr);
+      if (It != StrideCache.end())
+        std::tie(BasePtr, Stride) = It->second;
+      else
+        std::tie(BasePtr, Stride) = StrideCache[Ptr] =
+            determineBaseAndStride(Ptr);
+
+      // Skip if the memory access is not a strided access.
+      if (!BasePtr) {
+        assert(!Stride);
+        continue;
+      }
+      assert(Stride);
+
+      Instruction &Ingredient = MemR->getIngredient();
+      Type *ElementTy = getLoadStoreType(&Ingredient);
+
+      auto IsProfitable = [&](ElementCount VF) -> bool {
+        Type *DataTy = toVectorTy(ElementTy, VF);
+        const Align Alignment = getLoadStoreAlignment(&Ingredient);
+        if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+          return false;
+        const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+        const InstructionCost StridedLoadStoreCost =
+            Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
+                                           MemR->isMasked(), Alignment,
+                                           Ctx.CostKind, &Ingredient);
+        return StridedLoadStoreCost < CurrentCost;
+      };
+
+      if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
+                                                              Range)) {
+        PossiblyDead.insert(BasePtr);
+        PossiblyDead.insert(Stride);
+        continue;
+      }
+      PossiblyDead.insert(Ptr);
+
+      // Create a new vector pointer for the strided access.
+      auto *GEP = dyn_cast<GetElementPtrInst>(PtrUV->stripPointerCasts());
+      auto *NewPtr = new VPVectorPointerRecipe(BasePtr, ElementTy, Stride,
+                                               GEP ? GEP->getNoWrapFlags()
+                                                   : GEPNoWrapFlags::none(),
+                                               Ptr->getDebugLoc());
+      NewPtr->insertBefore(MemR);
+
+      auto *LoadR = cast<VPWidenLoadRecipe>(MemR);
+      auto *StridedLoad = new VPWidenStridedLoadRecipe(
+          *cast<LoadInst>(&Ingredient), NewPtr, Stride, &Plan.getVF(),
+          LoadR->getMask(), *LoadR, LoadR->getDebugLoc());
+      StridedLoad->insertBefore(LoadR);
+      LoadR->replaceAllUsesWith(StridedLoad);
+
+      ToErase.push_back(LoadR);
+    }
+  }
+
+  // Clean up dead memory access recipes, and unused base addresses and
+  // strides.
+  for (auto *R : ToErase)
+    R->eraseFromParent();
+  for (auto *V : PossiblyDead)
+    recursivelyDeleteDeadRecipes(V);
+}
+
 void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
                                                Type &CanonicalIVTy) {
   using namespace llvm::VPlanPatternMatch;
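
The core of the new transform is matchStridedStart, which recursively peels invariant Shl/Mul layers off the widened index until it reaches the induction variable, producing a uniform start value and a per-iteration stride. A minimal standalone sketch of that decomposition (plain C++ with hypothetical names, not LLVM API):

#include <cstdint>
#include <cstdio>

// An index expression that is affine in the loop counter: it has value
// Start on iteration zero and advances by Stride each iteration.
struct Affine {
  int64_t Start;
  int64_t Stride;
};

// Base case: an induction variable with first value S0 and step D,
// mirroring the VPWidenIntOrFpInductionRecipe case.
static Affine induction(int64_t S0, int64_t D) { return {S0, D}; }

// Multiplying by a loop-invariant value scales both start and stride,
// mirroring the Instruction::Mul case.
static Affine mul(Affine A, int64_t Inv) {
  return {A.Start * Inv, A.Stride * Inv};
}

// Shifting left by a loop-invariant amount does the same, mirroring the
// Instruction::Shl case.
static Affine shl(Affine A, int64_t Inv) {
  return {A.Start << Inv, A.Stride << Inv};
}

int main() {
  // The index (iv * 3) << 1 with iv = 0, 1, 2, ... decomposes into
  // start = 0, stride = 6: a strided access rather than a gather.
  Affine Idx = shl(mul(induction(0, 1), 3), 1);
  std::printf("start=%lld stride=%lld\n", (long long)Idx.Start,
              (long long)Idx.Stride);
  return 0;
}

determineBaseAndStride then folds the start back into a uniform scalar GEP (the VPReplicateRecipe base pointer), while the stride feeds the new VPVectorPointerRecipe and VPWidenStridedLoadRecipe. If getDecisionAndClampRange rejects the conversion on cost grounds, the speculatively created base and stride recipes are reclaimed through recursivelyDeleteDeadRecipes.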

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 6 additions & 0 deletions
@@ -175,6 +175,12 @@ struct VPlanTransforms {
           &InterleaveGroups,
       VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed);
 
+  /// Transform widen memory recipes into strided access recipes when legal
+  /// and profitable. Clamps \p Range to maintain consistency with widen
+  /// decisions of \p Plan, and uses \p Ctx to evaluate the cost.
+  static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
+                                       VFRange &Range);
+
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
