diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bc4500eb55a4f..69a2b9652572b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4173,6 +4173,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
   case VPDef::VPWidenIntOrFpInductionSC:
   case VPDef::VPWidenPointerInductionSC:
   case VPDef::VPReductionPHISC:
+  case VPDef::VPInterleaveEVLSC:
   case VPDef::VPInterleaveSC:
   case VPDef::VPWidenLoadEVLSC:
   case VPDef::VPWidenLoadSC:
@@ -4201,8 +4202,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       // If no def nor is a store, e.g., branches, continue - no value to
       // check.
       if (R.getNumDefinedValues() == 0 &&
-          !isa(
-              &R))
+          !isa(&R))
         continue;
       // For multi-def recipes, currently only interleaved loads, suffice to
       // check first def only.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 11a7d8b339ae9..f2c8ede409995 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPPartialReductionSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
+    case VPRecipeBase::VPInterleaveEVLSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
@@ -2387,11 +2388,14 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe {
   }
 };
 
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
-/// or stores into one wide load/store and shuffles. The first operand of a
-/// VPInterleave recipe is the address, followed by the stored values, followed
-/// by an optional mask.
-class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase {
+/// A common base class for interleaved memory operations.
+/// An interleaved memory operation combines multiple strided loads/stores into
+/// a single wide load/store with shuffles.
+/// The first operand must be the address. The optional operands are, in order,
+/// the stored values and the mask.
+/// TODO: Inherit from VPIRMetadata
+class LLVM_ABI_FOR_TEST VPInterleaveBase : public VPRecipeBase {
+protected:
   const InterleaveGroup<Instruction> *IG;
 
   /// Indicates if the interleave group is in a conditional block and requires a
@@ -2402,14 +2406,13 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase {
   /// unusued gaps can be loaded speculatively.
   bool NeedsMaskForGaps = false;
 
-public:
-  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
-                     ArrayRef<VPValue *> StoredValues, VPValue *Mask,
-                     bool NeedsMaskForGaps, DebugLoc DL)
-      : VPRecipeBase(VPDef::VPInterleaveSC, {Addr},
-                     DL),
-
-        IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) {
+  VPInterleaveBase(const unsigned char SC,
+                   const InterleaveGroup<Instruction> *IG,
+                   ArrayRef<VPValue *> Operands,
+                   ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+                   bool NeedsMaskForGaps, DebugLoc DL)
+      : VPRecipeBase(SC, Operands, DL), IG(IG),
+        NeedsMaskForGaps(NeedsMaskForGaps) {
     // TODO: extend the masked interleaved-group support to reversed access.
     assert((!Mask || !IG->isReverse()) &&
            "Reversed masked interleave-group not supported.");
@@ -2427,65 +2430,153 @@ class LLVM_ABI_FOR_TEST VPInterleaveRecipe : public VPRecipeBase {
       addOperand(Mask);
     }
   }
 
-  ~VPInterleaveRecipe() override = default;
 
-  VPInterleaveRecipe *clone() override {
-    return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
-                                  NeedsMaskForGaps, getDebugLoc());
+public:
+  VPInterleaveBase *clone() override {
+    llvm_unreachable("cloning not supported");
   }
 
-  VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+  static inline bool classof(const VPRecipeBase *R) {
+    return R->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
+           R->getVPDefID() == VPRecipeBase::VPInterleaveEVLSC;
+  }
+
+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && classof(R);
+  }
 
   /// Return the address accessed by this recipe.
   VPValue *getAddr() const {
     return getOperand(0); // Address is the 1st, mandatory operand.
   }
 
+  /// Return true if the access needs a mask because of the gaps.
+  bool needsMaskForGaps() const { return NeedsMaskForGaps; }
+
   /// Return the mask used by this recipe. Note that a full mask is represented
   /// by a nullptr.
   VPValue *getMask() const {
-    // Mask is optional and therefore the last, currently 2nd operand.
+    // Mask is optional and the last operand.
    return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
   }
 
+  const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
+
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("VPInterleaveBase should not be instantiated.");
+  }
+
+  /// Return the cost of this recipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  virtual bool onlyFirstLaneUsed(const VPValue *Op) const = 0;
+
+  /// Returns the number of stored operands of this interleave group. Returns 0
+  /// for load interleave groups.
+  virtual unsigned getNumStoreOperands() const = 0;
+
   /// Return the VPValues stored by this interleave group. If it is a load
   /// interleave group, return an empty ArrayRef.
   ArrayRef<VPValue *> getStoredValues() const {
-    // The first operand is the address, followed by the stored values, followed
-    // by an optional mask.
-    return ArrayRef(op_begin(), getNumOperands())
-        .slice(1, getNumStoreOperands());
+    return ArrayRef(op_end() -
+                        (getNumStoreOperands() + (HasMask ? 1 : 0)),
+                    getNumStoreOperands());
+  }
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+/// or stores into one wide load/store and shuffles. The first operand of a
+/// VPInterleave recipe is the address, followed by the stored values, followed
+/// by an optional mask.
+class LLVM_ABI_FOR_TEST VPInterleaveRecipe final : public VPInterleaveBase {
+public:
+  VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+                     ArrayRef<VPValue *> StoredValues, VPValue *Mask,
+                     bool NeedsMaskForGaps, DebugLoc DL)
+      : VPInterleaveBase(VPDef::VPInterleaveSC, IG, ArrayRef<VPValue *>({Addr}),
+                         StoredValues, Mask, NeedsMaskForGaps, DL) {}
+
+  ~VPInterleaveRecipe() override = default;
+
+  VPInterleaveRecipe *clone() override {
+    return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(),
+                                  NeedsMaskForGaps, getDebugLoc());
   }
 
+  VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
+
   /// Generate the wide load or store, and shuffles.
   void execute(VPTransformState &State) override;
-  /// Return the cost of this VPInterleaveRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
 
-  const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+  }
 
-  /// Returns the number of stored operands of this interleave group. Returns 0
-  /// for load interleave groups.
-  unsigned getNumStoreOperands() const {
+  unsigned getNumStoreOperands() const override {
     return getNumOperands() - (HasMask ? 2 : 1);
   }
+};
+
+/// A recipe for interleaved access operations with vector-predication
+/// intrinsics. The first operand is the address, the second operand is the
+/// explicit vector length. Stored values and mask are optional operands.
+class LLVM_ABI_FOR_TEST VPInterleaveEVLRecipe final : public VPInterleaveBase {
+public:
+  VPInterleaveEVLRecipe(VPInterleaveRecipe &R, VPValue &EVL, VPValue *Mask)
+      : VPInterleaveBase(VPDef::VPInterleaveEVLSC, R.getInterleaveGroup(),
+                         ArrayRef<VPValue *>({R.getAddr(), &EVL}),
+                         R.getStoredValues(), Mask, R.needsMaskForGaps(),
+                         R.getDebugLoc()) {
+    assert(!IG->isReverse() &&
+           "Reversed interleave-group with tail folding is not supported.");
+    assert(!needsMaskForGaps() && "Interleaved access with gap mask is not "
+                                  "supported for scalable vector.");
+  }
+
+  ~VPInterleaveEVLRecipe() override = default;
+
+  VPInterleaveEVLRecipe *clone() override {
+    llvm_unreachable("cloning not implemented yet");
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPInterleaveEVLSC)
+
+  /// The VPValue of the explicit vector length.
+  VPValue *getEVL() const { return getOperand(1); }
 
-  /// The recipe only uses the first lane of the address.
+  /// Generate the wide load or store, and shuffles.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// The recipe only uses the first lane of the address and the EVL operand.
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
     assert(is_contained(operands(), Op) &&
            "Op must be an operand of the recipe");
-    return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
+    return (Op == getAddr() && !llvm::is_contained(getStoredValues(), Op)) ||
+           Op == getEVL();
   }
 
-  Instruction *getInsertPos() const { return IG->getInsertPos(); }
+  unsigned getNumStoreOperands() const override {
+    return getNumOperands() - (HasMask ? 3 : 2);
+  }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index b39231f106300..59ca851494bf0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -296,7 +296,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           .Case(
              [this](const auto *R) { return inferScalarTypeForRecipe(R); })
-          .Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
+          .Case<VPInterleaveRecipe, VPInterleaveEVLRecipe>([V](const auto *R) {
             // TODO: Use info from interleave group.
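            // Illustrative note: every value defined by an interleave recipe
            // corresponds to one member load of the group, so its scalar type
            // is simply the IR type of that member. For a hypothetical
            // factor-2 group whose members load i32 and float, the two defined
            // VPValues keep the types i32 and float; execute() bitcasts the
            // deinterleaved sub-vector whenever a member's type differs from
            // the type at the group's insert position.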
return V->getUnderlyingValue()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 23c10d2b25263..9e8c31555fbf9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -53,8 +53,9 @@ bool VPRecipeBase::mayWriteToMemory() const { return cast(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast(this)->opcodeMayReadOrWriteFromMemory(); + case VPInterleaveEVLSC: case VPInterleaveSC: - return cast(this)->getNumStoreOperands() > 0; + return cast(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: case VPWidenStoreSC: return true; @@ -143,6 +144,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return false; } default: + // FIXME: Return false if the recipe represents an interleaved store. return true; } } @@ -184,6 +186,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPInterleaveEVLSC: case VPInterleaveSC: return mayWriteToMemory(); case VPWidenLoadEVLSC: @@ -256,7 +259,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { Instruction *UI = nullptr; if (auto *S = dyn_cast(this)) UI = dyn_cast_or_null(S->getUnderlyingValue()); - else if (auto *IG = dyn_cast(this)) + else if (auto *IG = dyn_cast(this)) UI = IG->getInsertPos(); else if (auto *WidenMem = dyn_cast(this)) UI = &WidenMem->getIngredient(); @@ -2133,7 +2136,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { if (VF.isScalar()) return TTI::CastContextHint::Normal; - if (isa(R)) + if (isa(R)) return TTI::CastContextHint::Interleave; if (const auto *ReplicateRecipe = dyn_cast(R)) return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked @@ -3670,8 +3673,155 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +void VPInterleaveEVLRecipe::execute(VPTransformState &State) { + assert(!State.Lane && "Interleave group being replicated."); + assert(State.VF.isScalable() && + "Only support scalable VF for EVL tail-folding."); + assert(!NeedsMaskForGaps && + "Masking gaps for scalable vectors is not yet supported."); + const InterleaveGroup *Group = IG; + Instruction *Instr = Group->getInsertPos(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = getLoadStoreType(Instr); + unsigned InterleaveFactor = Group->getFactor(); + assert(InterleaveFactor <= 8 && + "Unsupported deinterleave/interleave factor for scalable vectors"); + ElementCount WideVF = State.VF * InterleaveFactor; + auto *VecTy = VectorType::get(ScalarTy, WideVF); + + VPValue *BlockInMask = getMask(); + VPValue *Addr = getAddr(); + Value *ResAddr = State.get(Addr, VPLane(0)); + Value *EVL = State.get(getEVL(), VPLane(0)); + + auto CreateGroupMask = [&BlockInMask, &State, + &InterleaveFactor]() -> Value * { + auto *ResBlockInMask = State.get(BlockInMask); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); + }; + + Value *GroupMask = nullptr; + if (BlockInMask) + GroupMask = CreateGroupMask(); + else + GroupMask = + State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue()); + + const DataLayout &DL = Instr->getDataLayout(); + // Vectorize the interleaved load group. 
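  // What follows is the load lowering: a single llvm.vp.load of the wide type
  // (VF * InterleaveFactor elements), predicated by the interleaved group mask
  // (or an all-true splat) and limited by the EVL, followed by an
  // llvm.vector.deinterleave<Factor> that splits the wide value into one
  // sub-vector per member; gap members are skipped. Roughly, for a factor-2
  // i32 group (names and types chosen for illustration):
  //   %wide = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(
  //               ptr align 4 %addr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
  //   %sv   = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
  //               @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)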
+ if (isa(Instr)) { + CallInst *NewLoad = State.Builder.CreateIntrinsic(VecTy, Intrinsic::vp_load, + {ResAddr, GroupMask, EVL}, + nullptr, "wide.vp.load"); + NewLoad->addParamAttr(0, Attribute::getWithAlignment(NewLoad->getContext(), + Group->getAlign())); + + Group->addMetadata(NewLoad); + + ArrayRef VPDefs = definedValues(); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + NewLoad = State.Builder.CreateIntrinsic( + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + + // Skip the gaps in the group. + if (!Member) + continue; + + Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } + + State.set(VPDefs[J], StridedVec); + ++J; + } + return; + } + + // The sub vector type for current instruction. + auto *SubVT = VectorType::get(ScalarTy, State.VF); + + // Vectorize the interleaved store group. + ArrayRef StoredValues = getStoredValues(); + // Collect the stored vector from each member. + SmallVector StoredVecs; + unsigned StoredIdx = 0; + for (unsigned I = 0; I < InterleaveFactor; I++) { + Instruction *Member = Group->getMember(I); + + // Skip the gaps in the group. + if (!Member) { + Value *Undef = PoisonValue::get(SubVT); + StoredVecs.push_back(Undef); + continue; + } + + Value *StoredVec = State.get(StoredValues[StoredIdx]); + ++StoredIdx; + + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); + + StoredVecs.push_back(StoredVec); + } + + // Interleave all the smaller vectors into one wider vector. 
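  // The store path mirrors the load path: the per-member vectors (poison for
  // gaps) are combined with llvm.vector.interleave<Factor> and written by a
  // single llvm.vp.store predicated by the group mask and the EVL. Roughly,
  // for a factor-2 i32 group (names and types chosen for illustration):
  //   %ivec = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(
  //               <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  //   call void @llvm.vp.store.nxv8i32.p0(<vscale x 8 x i32> %ivec,
  //               ptr align 4 %addr, <vscale x 8 x i1> splat (i1 true), i32 %evl)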
+ Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); + CallInst *NewStore = State.Builder.CreateIntrinsic( + Type::getVoidTy(EVL->getContext()), Intrinsic::vp_store, + {IVec, ResAddr, GroupMask, EVL}); + NewStore->addParamAttr(1, Attribute::getWithAlignment(NewStore->getContext(), + Group->getAlign())); + + Group->addMetadata(NewStore); +} +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << ", "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", "; + getEVL()->printAsOperand(O, SlotTracker); + if (VPValue *Mask = getMask()) { + O << ", "; + Mask->printAsOperand(O, SlotTracker); + } + + unsigned OpIdx = 0; + for (unsigned i = 0; i < IG->getFactor(); ++i) { + if (!IG->getMember(i)) + continue; + if (getNumStoreOperands() > 0) { + O << "\n" << Indent << " vp.store "; + getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); + O << " to index " << i; + } else { + O << "\n" << Indent << " "; + getVPValue(OpIdx)->printAsOperand(O, SlotTracker); + O << " = vp.load from index " << i; + } + ++OpIdx; + } +} +#endif + +InstructionCost VPInterleaveBase::computeCost(ElementCount VF, + VPCostContext &Ctx) const { Instruction *InsertPos = getInsertPos(); // Find the VPValue index of the interleave group. We need to skip gaps. unsigned InsertPosIdx = 0; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f75b2f21b6f1b..1cbf02c613371 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2162,6 +2162,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, VPValue *NewMask = GetNewMask(S->getMask()); return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); }) + .Case([&](VPInterleaveRecipe *IR) { + VPValue *NewMask = GetNewMask(IR->getMask()); + return new VPInterleaveEVLRecipe(*IR, EVL, NewMask); + }) .Case([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); return new VPReductionEVLRecipe(*Red, EVL, NewMask); @@ -2283,16 +2287,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { if (!EVLRecipe) continue; - [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); + unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); assert(NumDefVal == CurRecipe->getNumDefinedValues() && "New recipe must define the same number of values as the " "original."); - assert(NumDefVal <= 1 && - "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa(EVLRecipe)) { - VPValue *CurVPV = CurRecipe->getVPSingleValue(); - CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); + if (isa( + EVLRecipe)) { + for (unsigned I = 0; I < NumDefVal; ++I) { + VPValue *CurVPV = CurRecipe->getVPValue(I); + CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I)); + } } ToErase.push_back(CurRecipe); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 24f6d61512ef6..85c6c2c8d7965 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -38,7 +38,7 @@ struct VPDoubleValueDef; class VPSlotTracker; class VPUser; class VPRecipeBase; -class VPInterleaveRecipe; +class VPInterleaveBase; class VPPhiAccessors; // This is 
the base class of the VPlan Def/Use graph, used for modeling the data @@ -48,7 +48,7 @@ class VPPhiAccessors; class LLVM_ABI_FOR_TEST VPValue { friend class VPDef; friend struct VPDoubleValueDef; - friend class VPInterleaveRecipe; + friend class VPInterleaveBase; friend class VPlan; friend class VPExpressionRecipe; @@ -335,6 +335,7 @@ class VPDef { VPExpressionSC, VPIRInstructionSC, VPInstructionSC, + VPInterleaveEVLSC, VPInterleaveSC, VPReductionEVLSC, VPReductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 3417e1c8dc1ea..4c0dc9bb08046 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { } return VerifyEVLUse(*R, 2); }) - .Case( + .Case( [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); }) .Case( [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); }) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 4b7c385d34959..323e66b8e62ba 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -401,18 +401,13 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP16]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP14:%.*]] = icmp ult [[TMP13]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP14]], [[TMP14]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP22]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i8( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index 378478c00cd5a..06c41b53ddd35 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -13,22 +13,16 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP13:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP8]], splat (i32 1) ; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 2) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP11]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv8i32.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv8i32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] @@ -108,22 +102,16 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP14:%.*]] = shl i64 [[INDEX]], 1 ; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP14]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; SCALABLE-NEXT: [[TMP10:%.*]] = add 
[[TMP8]], splat (i32 1) ; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 2) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP11]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP13]], [[TMP13]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i32.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] @@ -185,22 +173,16 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() -; CHECK-NEXT: [[TMP10:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv4i64.p0(ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP20]], splat (i64 1) ; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 2) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP13]], [[TMP11]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv4i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] @@ -280,22 +262,16 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP6:%.*]] = call 
@llvm.stepvector.nxv2i32() -; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 1 ; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv4i64.p0(ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP20:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP20]], splat (i64 1) ; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 2) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP13]], [[TMP11]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP10]], [[TMP10]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv4i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP16:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] @@ -357,14 +333,9 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP14:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv12i32.p0(ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv12i32( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 1 @@ -373,8 +344,7 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP9]], splat (i32 2) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], splat (i32 3) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv12i32( [[TMP11]], [[TMP12]], [[TMP13]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv12i32.p0( [[INTERLEAVED_VEC]], 
ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv12i32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] @@ -468,14 +438,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[INDEX]], 3 ; SCALABLE-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP16]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv12i32.p0(ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv12i32.p0(ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv12i32( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 1 @@ -484,8 +449,7 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP9]], splat (i32 2) ; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP10]], splat (i32 3) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv12i32( [[TMP11]], [[TMP12]], [[TMP13]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave3.nxv12i1( [[TMP14]], [[TMP14]], [[TMP14]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv12i32.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 4, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv12i32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] @@ -558,14 +522,9 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] -; CHECK-NEXT: 
[[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave3.nxv6i1( [[TMP11]], [[TMP11]], [[TMP11]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv6i64.p0(ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv6i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 1 @@ -574,8 +533,7 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP9]], splat (i64 2) ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], splat (i64 3) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv6i64( [[TMP25]], [[TMP12]], [[TMP13]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave3.nxv6i1( [[TMP11]], [[TMP11]], [[TMP11]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] @@ -669,14 +627,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() -; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 3 ; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave3.nxv6i1( [[TMP11]], [[TMP11]], [[TMP11]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv6i64.p0(ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , } @llvm.vector.deinterleave3.nxv6i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP23:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , } [[STRIDED_VEC]], 1 @@ -685,8 +638,7 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP9]], splat (i64 2) ; SCALABLE-NEXT: [[TMP13:%.*]] = add [[TMP10]], splat (i64 3) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv6i64( [[TMP25]], [[TMP12]], [[TMP13]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave3.nxv6i1( [[TMP11]], [[TMP11]], [[TMP11]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP14]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr align 8 
[[TMP14]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] @@ -759,14 +711,9 @@ define void @load_store_factor4(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() -; CHECK-NEXT: [[TMP10:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv8i1( [[TMP10]], [[TMP10]], [[TMP10]], [[TMP10]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i64.p0(ptr [[TMP9]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv8i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 @@ -777,8 +724,7 @@ define void @load_store_factor4(ptr %p) { ; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP12]], splat (i64 3) ; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP13]], splat (i64 4) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv8i64( [[TMP26]], [[TMP15]], [[TMP16]], [[TMP17]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave4.nxv8i1( [[TMP10]], [[TMP10]], [[TMP10]], [[TMP10]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] @@ -884,14 +830,9 @@ define void @load_store_factor4(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() -; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[INDEX]], 4 ; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv8i1( [[TMP10]], [[TMP10]], [[TMP10]], [[TMP10]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = 
call @llvm.masked.load.nxv8i64.p0(ptr [[TMP9]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv8i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP24:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 @@ -902,8 +843,7 @@ define void @load_store_factor4(ptr %p) { ; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP12]], splat (i64 3) ; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP13]], splat (i64 4) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv8i64( [[TMP26]], [[TMP15]], [[TMP16]], [[TMP17]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave4.nxv8i1( [[TMP10]], [[TMP10]], [[TMP10]], [[TMP10]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP22]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] @@ -987,14 +927,9 @@ define void @load_store_factor5(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv1i32() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv5i64.p0(ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , } @llvm.vector.deinterleave5.nxv5i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 1 @@ -1007,8 +942,7 @@ define void @load_store_factor5(ptr %p) { ; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP11]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = add [[TMP12]], splat (i64 5) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave5.nxv5i64( [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv5i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv5i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP25:%.*]] = zext i32 
[[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]] @@ -1128,14 +1062,9 @@ define void @load_store_factor5(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv1i32() -; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 5 ; SCALABLE-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv5i64.p0(ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv5i64.p0(ptr align 8 [[TMP19]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , } @llvm.vector.deinterleave5.nxv5i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , } [[STRIDED_VEC]], 1 @@ -1148,8 +1077,7 @@ define void @load_store_factor5(ptr %p) { ; SCALABLE-NEXT: [[TMP16:%.*]] = add [[TMP11]], splat (i64 4) ; SCALABLE-NEXT: [[TMP17:%.*]] = add [[TMP12]], splat (i64 5) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave5.nxv5i64( [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave5.nxv5i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv5i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP19]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv5i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP19]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP25:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP25]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]] @@ -1244,14 +1172,9 @@ define void @load_store_factor6(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.stepvector.nxv1i32() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP20]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: 
[[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv6i64.p0(ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv6i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 1 @@ -1266,8 +1189,7 @@ define void @load_store_factor6(ptr %p) { ; CHECK-NEXT: [[TMP18:%.*]] = add [[TMP12]], splat (i64 5) ; CHECK-NEXT: [[TMP19:%.*]] = add [[TMP13]], splat (i64 6) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv6i64( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]] @@ -1400,14 +1322,9 @@ define void @load_store_factor6(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP20:%.*]] = call @llvm.stepvector.nxv1i32() -; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP20]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 6 ; SCALABLE-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv6i64.p0(ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv6i64.p0(ptr align 8 [[TMP21]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , } @llvm.vector.deinterleave6.nxv6i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , } [[STRIDED_VEC]], 1 @@ -1422,8 +1339,7 @@ define void @load_store_factor6(ptr %p) { ; SCALABLE-NEXT: [[TMP18:%.*]] = add [[TMP12]], splat (i64 5) ; SCALABLE-NEXT: [[TMP19:%.*]] = add [[TMP13]], splat (i64 6) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave6.nxv6i64( [[TMP14]], [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave6.nxv6i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP21]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void 
@llvm.vp.store.nxv6i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP21]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP28:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP28]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]] @@ -1529,14 +1445,9 @@ define void @load_store_factor7(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.stepvector.nxv1i32() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP22]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv7i64.p0(ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , } @llvm.vector.deinterleave7.nxv7i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 1 @@ -1553,8 +1464,7 @@ define void @load_store_factor7(ptr %p) { ; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP13]], splat (i64 6) ; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP14]], splat (i64 7) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave7.nxv7i64( [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv7i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv7i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]] @@ -1701,14 +1611,9 @@ define void @load_store_factor7(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP22:%.*]] = call @llvm.stepvector.nxv1i32() -; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP22]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 [[INDEX]], 7 ; SCALABLE-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 
[[TMP6]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv7i64.p0(ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv7i64.p0(ptr align 8 [[TMP23]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , } @llvm.vector.deinterleave7.nxv7i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , } [[STRIDED_VEC]], 1 @@ -1725,8 +1630,7 @@ define void @load_store_factor7(ptr %p) { ; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP13]], splat (i64 6) ; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP14]], splat (i64 7) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave7.nxv7i64( [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave7.nxv7i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv7i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP23]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv7i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP23]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP31:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP31]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP31]] @@ -1843,14 +1747,9 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv1i32() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult [[TMP4]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i64.p0(ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv8i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 1 @@ -1869,8 +1768,7 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 7) ; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 8) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv8i64( [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]], [[TMP22]], [[TMP23]]) -; CHECK-NEXT: [[INTERLEAVED_MASK1:%.*]] = call 
@llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK1]]) +; CHECK-NEXT: call void @llvm.vp.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]] @@ -2028,14 +1926,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 1, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv1i32() -; SCALABLE-NEXT: [[TMP5:%.*]] = icmp ult [[TMP4]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3 ; SCALABLE-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i64.p0(ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i64.p0(ptr align 8 [[TMP24]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , , , , , , , } @llvm.vector.deinterleave8.nxv8i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , , , , , , , } [[STRIDED_VEC]], 1 @@ -2054,8 +1947,7 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 7) ; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 8) ; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave8.nxv8i64( [[TMP16]], [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]], [[TMP21]], [[TMP22]], [[TMP23]]) -; SCALABLE-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave8.nxv8i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; SCALABLE-NEXT: call void @llvm.masked.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr [[TMP24]], i32 8, [[INTERLEAVED_MASK1]]) +; SCALABLE-NEXT: call void @llvm.vp.store.nxv8i64.p0( [[INTERLEAVED_VEC]], ptr align 8 [[TMP24]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP34:%.*]] = zext i32 [[TMP7]] to i64 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP34]], [[INDEX]] ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP34]] @@ -2183,14 +2075,9 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP12:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP12]], [[TMP12]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 @@ -2272,14 +2159,9 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 ; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP13]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv8i1( [[TMP12]], [[TMP12]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv8i32.p0(ptr [[TMP15]], i32 4, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 @@ -2347,14 +2229,9 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() -; CHECK-NEXT: [[TMP12:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]] -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP12]], [[TMP12]]) -; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv4i64.p0(ptr [[TMP15]], i32 8, [[INTERLEAVED_MASK]], poison) +; 
CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_MASKED_VEC]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 @@ -2436,14 +2313,9 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i32() -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp ult [[TMP6]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 ; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP13]] -; SCALABLE-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv4i1( [[TMP12]], [[TMP12]]) -; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv4i64.p0(ptr [[TMP15]], i32 8, [[INTERLEAVED_MASK]], poison) +; SCALABLE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.vp.load.nxv4i64.p0(ptr align 8 [[TMP15]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_MASKED_VEC]]) ; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index 1bceb871bd999..f581442112eea 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -113,34 +113,29 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult [[TMP2]], [[BROADCAST_SPLAT4]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select [[TMP3]], [[TMP4]], zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = 
shl i32 [[EVL_BASED_IV]], 1 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP5]], [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP8]], i32 1, [[INTERLEAVED_MASK]], poison) -; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = call @llvm.smax.nxv16i8( [[TMP9]], [[TMP10]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP11]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP11]], [[TMP14]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP5]], [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP13]], i32 1, [[INTERLEAVED_MASK5]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP2]], [[TMP2]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call @llvm.vp.load.nxv32i8.p0(ptr align 1 [[TMP5]], [[INTERLEAVED_MASK]], i32 [[TMP1]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_VP_LOAD]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = call @llvm.smax.nxv16i8( [[TMP6]], [[TMP7]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = sext i32 [[TMP3]] to i64 +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP9]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub zeroinitializer, [[TMP8]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP2]], [[TMP2]]) +; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP8]], [[TMP11]]) +; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP10]], [[INTERLEAVED_MASK3]], i32 [[TMP1]]) ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 -; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: ; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA-WITH-EVL: scalar.ph: @@ -296,38 +291,33 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL:%.*]] = phi i32 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult [[TMP2]], [[BROADCAST_SPLAT4]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = select [[TMP3]], [[TMP4]], zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = shl i32 [[EVL_BASED_IV]], 2 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP8]], i32 1, [[INTERLEAVED_MASK]], poison) -; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[TMP9]], [[TMP10]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP15]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP17:%.*]] = sext i32 [[TMP6]] to i64 -; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP17]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP13]], [[TMP14]], [[TMP15]], [[TMP16]]) -; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK5:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) -; 
PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK5]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = shl i32 [[EVL_BASED_IV]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP4]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[WIDE_VP_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr align 1 [[TMP5]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], i32 [[TMP1]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_VP_LOAD]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP11:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP10]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP12]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP14:%.*]] = sext i32 [[TMP3]] to i64
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP2]])
+; PREDICATED_DATA-WITH-EVL-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]])
+; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP15]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]], i32 [[TMP1]])
 ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
 ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
 ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
-; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; PREDICATED_DATA-WITH-EVL: middle.block:
 ; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
 ; PREDICATED_DATA-WITH-EVL: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index c3c88372f9dcf..78ca6593aeac4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -7,7 +7,6 @@
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
 ; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s

-; FIXME: interleaved accesses are not supported yet with predicated vectorization.
 define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-LABEL: @interleave(
 ; IF-EVL-NEXT: entry:
@@ -17,25 +16,20 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL: vector.body:
 ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ult <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
 ; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0
-; IF-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[TMP17]])
-; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP6]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vp.load.nxv8i32.p0(ptr align 4 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP16]])
 ; IF-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; IF-EVL-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; IF-EVL-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; IF-EVL-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP15]], [[TMP14]]
 ; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP16]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP16]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
 ; IF-EVL: scalar.ph: