diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 3d6fc309c7cf4..7c0525b9c9957 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -97,7 +97,40 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { return enum_seq(LOAD_CNT, MaxCounter); } -using RegInterval = std::pair<int, int>; +/// Integer IDs used to track vector memory locations we may have to wait on. +/// Encoded as u16 chunks: +/// +/// [0, REGUNITS_END): MCRegUnit +/// [LDSDMA_BEGIN, LDSDMA_END): LDS DMA IDs +/// +/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary. +/// It gives 2^16 entries per category, which is more than enough +/// for all register units. MCPhysReg is u16 so we don't even support >u16 +/// physical register numbers at this time, let alone >u16 register units. +/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END +/// is enough for all register units. +using VMEMID = uint32_t; + +enum : VMEMID { + TRACKINGID_RANGE_LEN = (1 << 16), + + // Important: MCRegUnits must always be tracked starting from 0, as we + // need to be able to convert between an MCRegUnit and a VMEMID freely. + REGUNITS_BEGIN = 0, + REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN, + + // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common" + // entry, which is updated for all LDS DMA operations encountered. + // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1. + NUM_LDSDMA = TRACKINGID_RANGE_LEN, + LDSDMA_BEGIN = REGUNITS_END, + LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA, +}; + +/// Convert an MCRegUnit to a VMEMID. +static constexpr VMEMID toVMEMID(MCRegUnit RU) { + return static_cast<VMEMID>(RU); +} struct HardwareLimits { unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. @@ -146,30 +179,6 @@ static constexpr StringLiteral WaitEventTypeName[] = { #undef AMDGPU_EVENT_NAME // clang-format on -// The mapping is: -// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs -// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots -// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs -// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC -// We reserve a fixed number of VGPR slots in the scoring tables for -// special tokens like SCMEM_LDS (needed for buffer load to LDS). -enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets. - AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets. - SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. - // Artificial register slots to track LDS writes into specific LDS locations - // if a location is known. When slots are exhausted or location is - // unknown use the first slot. The first slot is also always updated in - // addition to known location's slot to properly generate waits if dependent - // instruction's location is unknown. - FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores. - NUM_LDS_VGPRS = 9, // One more than the stores we track. - NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start. - NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS, - // Remaining non-allocatable registers - SCC = NUM_ALL_ALLOCATABLE -}; - // Enumerate different types of result-returning VMEM operations. Although // s_waitcnt orders them all with a single vmcnt counter, in the absence of // s_waitcnt only instructions of the same VmemType are guaranteed to write @@ -585,7 +594,30 @@ class SIInsertWaitcnts { // "s_waitcnt 0" before use.
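As a minimal, self-contained sketch of the VMEMID partitioning described above (the enum values mirror the ones in the patch; toLDSDMAID is a hypothetical helper invented for this illustration, not part of the patch):

#include <cstdint>

namespace vmemid_sketch {
using VMEMID = uint32_t;
enum : VMEMID {
  TRACKINGID_RANGE_LEN = 1u << 16, // 65536 IDs per category
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END, // slot 0: the "common" LDS DMA entry
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};

// Hypothetical helper: VMEMID of LDS DMA slot `Slot`, where slot 0 is the
// common entry that is updated for every LDS DMA operation encountered.
constexpr VMEMID toLDSDMAID(uint32_t Slot) { return LDSDMA_BEGIN + Slot; }

static_assert(REGUNITS_END == 0x10000, "reg units fill the low u16 chunk");
static_assert(toLDSDMAID(0) == 0x10000, "common LDS DMA entry comes next");
static_assert(LDSDMA_END == 0x20000, "LDS DMA IDs fill the second chunk");
} // namespace vmemid_sketch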
class WaitcntBrackets { public: - WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) { + assert(Context->TRI->getNumRegUnits() < REGUNITS_END); + } + +#ifndef NDEBUG + ~WaitcntBrackets() { + unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0; + for (auto &[ID, Val] : VMem) { + if (Val.empty()) + ++NumUnusedVmem; + } + for (auto &[ID, Val] : SGPRs) { + if (Val.empty()) + ++NumUnusedSGPRs; + } + + if (NumUnusedVmem || NumUnusedSGPRs) { + errs() << "WaitcntBracket had unused entries at destruction time: " + << NumUnusedVmem << " VMem and " << NumUnusedSGPRs + << " SGPR unused entries\n"; + std::abort(); + } + } +#endif bool isSmemCounter(InstCounterType T) const { return T == Context->SmemAccessCounter || T == X_CNT; @@ -610,22 +642,18 @@ class WaitcntBrackets { return getScoreUB(T) - getScoreLB(T); } - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - - if (GprNo < NUM_ALL_ALLOCATABLE) - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const { + auto It = SGPRs.find(RU); + return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0; + } - assert(GprNo == SCC); - return SCCScore; + unsigned getVMemScore(VMEMID TID, InstCounterType T) const { + auto It = VMem.find(TID); + return It != VMem.end() ? It->second.Scores[T] : 0; } bool merge(const WaitcntBrackets &Other); - RegInterval getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const; - bool counterOutOfOrder(InstCounterType T) const; void simplifyWaitcnt(AMDGPU::Waitcnt &Wait); void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; @@ -633,12 +661,10 @@ class WaitcntBrackets { bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait); void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait); - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } + void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const; + void determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const; void tryClearSCCWriteEvent(MachineInstr *Inst); void applyWaitcnt(const AMDGPU::Waitcnt &Wait); @@ -686,19 +712,22 @@ class WaitcntBrackets { // Return true if there might be pending writes to the vgpr-interval by VMEM // instructions with types different from V. 
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) + bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const { + for (MCRegUnit RU : regunits(Reg)) { + auto It = VMem.find(toVMEMID(RU)); + if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V))) return true; } return false; } - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; + void clearVgprVmemTypes(MCPhysReg Reg) { + for (MCRegUnit RU : regunits(Reg)) { + if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) { + It->second.VMEMTypes = 0; + if (It->second.empty()) + VMem.erase(It); + } } } @@ -714,11 +743,15 @@ class WaitcntBrackets { bool hasPointSampleAccel(const MachineInstr &MI) const; bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; + MCPhysReg Reg) const; void print(raw_ostream &) const; void dump() const { print(dbgs()); } + // Free up memory by removing empty entries from the DenseMaps that track + // event scores. + void purgeEmptyTrackingData(); + private: struct MergeInfo { unsigned OldLB; @@ -726,9 +759,24 @@ class WaitcntBrackets { unsigned MyShift; unsigned OtherShift; }; + + void determineWaitForScore(InstCounterType T, unsigned Score, + AMDGPU::Waitcnt &Wait) const; + static bool mergeScore(const MergeInfo &M, unsigned &Score, unsigned OtherScore); + iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const { + assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC"); + if (!Context->TRI->isInAllocatableClass(Reg)) + return {{}, {}}; + const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg); + unsigned Size = Context->TRI->getRegSizeInBits(*RC); + if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) + Reg = Context->TRI->get32BitRegister(Reg); + return Context->TRI->regunits(Reg); + } + void setScoreLB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); ScoreLBs[T] = Val; @@ -745,15 +793,28 @@ class WaitcntBrackets { ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); } - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); + void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) { + const SIRegisterInfo *TRI = Context->TRI; + if (Reg == AMDGPU::SCC) { + SCCScore = Val; + } else if (TRI->isVectorRegister(*Context->MRI, Reg)) { + for (MCRegUnit RU : regunits(Reg)) + VMem[toVMEMID(RU)].Scores[T] = Val; + } else if (TRI->isSGPRReg(*Context->MRI, Reg)) { + auto STy = getSgprScoresIdx(T); + for (MCRegUnit RU : regunits(Reg)) + SGPRs[RU].Scores[STy] = Val; + } else { + llvm_unreachable("Register cannot be tracked/unknown register!"); + } } - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); + void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) { + VMem[TID].Scores[T] = Val; + } - void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op, - InstCounterType CntTy, unsigned Val); + void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); const SIInsertWaitcnts *Context; @@ -764,26 +825,52 @@ class WaitcntBrackets { unsigned LastFlat[NUM_INST_CNTS] = {0}; // Remember the last GDS operation. unsigned LastGDS = 0; - // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + + // The score tracking logic is fragmented as follows: + // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding. + // - SGPRs: SGPR RegUnits + // - SCC: Non-allocatable and not general purpose: not an SGPR. + // + // For the VMem case, if the key is within the range of LDS DMA IDs, + // then the corresponding index into the `LDSDMAStores` vector below is: + // Key - LDSDMA_BEGIN - 1 + // This is because LDSDMA_BEGIN is a generic entry and does not have an + // associated MachineInstr. + // + // TODO: Could we track SCC alongside SGPRs so it's no longer a special case? + + struct VMEMInfo { + // Scores for all instruction counters. + std::array<unsigned, NUM_INST_CNTS> Scores = {0}; + // Bitmask of the VmemTypes of VMEM instructions that may have a pending + // write to this register unit. + unsigned VMEMTypes = 0; + + bool empty() const { + return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes; + } + }; + + struct SGPRInfo { + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps + // the X_CNT score. + std::array<unsigned, 2> Scores = {0}; + + bool empty() const { return !Scores[0] && !Scores[1]; } + }; + + DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA + DenseMap<MCRegUnit, SGPRInfo> SGPRs; + // Reg score for SCC. unsigned SCCScore = 0; // The unique instruction that has an SCC write pending, if there is one. const MachineInstr *PendingSCCWrite = nullptr; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is // alias info. One store is kept per unique AAInfo. - SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; + SmallVector<const MachineInstr *> LDSDMAStores; }; class SIInsertWaitcntsLegacy : public MachineFunctionPass { @@ -809,82 +896,9 @@ class SIInsertWaitcntsLegacy : public MachineFunctionPass { } // end anonymous namespace -RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const { - if (Op.getReg() == AMDGPU::SCC) - return {SCC, SCC + 1}; - - const SIRegisterInfo *TRI = Context->TRI; - const MachineRegisterInfo *MRI = Context->MRI; - - if (!TRI->isInAllocatableClass(Op.getReg())) - return {-1, -1}; - - // A use via a PW operand does not need a waitcnt. - // A partial write is not a WAW. - assert(!Op.getSubReg() || !Op.isUndef()); - - RegInterval Result; - - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); - unsigned RegIdx = TRI->getHWRegIndex(MCReg); - - const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); - unsigned Size = TRI->getRegSizeInBits(*RC); - - // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits - if (TRI->isVectorRegister(*MRI, Op.getReg())) { - unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ?
1 : 0); - assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET); - Result.first = Reg; - if (TRI->isAGPR(*MRI, Op.getReg())) - Result.first += AGPR_OFFSET; - assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); - assert(Size % 16 == 0); - Result.second = Result.first + (Size / 16); - - if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) { - // Regardless of which lo16/hi16 is used, consider the full 32-bit - // register used. - if (AMDGPU::isHi16Reg(MCReg, *TRI)) - Result.first -= 1; - else - Result.second += 1; - } - } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { - // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar - // sources like SRC_PRIVATE_BASE. - Result.first = RegIdx + NUM_ALL_VGPRS; - Result.second = Result.first + divideCeil(Size, 32); - } else { - return {-1, -1}; - } - - return Result; -} - -void WaitcntBrackets::setScoreByInterval(RegInterval Interval, - InstCounterType CntTy, - unsigned Score) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (RegNo < NUM_ALL_VGPRS) { - VgprUB = std::max(VgprUB, RegNo); - VgprScores[CntTy][RegNo] = Score; - } else if (RegNo < NUM_ALL_ALLOCATABLE) { - SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); - SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; - } else { - assert(RegNo == SCC); - SCCScore = Score; - } - } -} - -void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, - const MachineOperand &Op, +void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, unsigned Score) { - RegInterval Interval = getRegInterval(MI, Op); - setScoreByInterval(Interval, CntTy, Score); + setRegScore(Op.getReg().asMCReg(), CntTy, Score); } // Return true if the subtarget is one that enables Point Sample Acceleration @@ -907,12 +921,12 @@ bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { // one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER // (this is the type that a point sample accelerated instruction effectively // becomes) -bool WaitcntBrackets::hasPointSamplePendingVmemTypes( - const MachineInstr &MI, RegInterval Interval) const { +bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI, + MCPhysReg Reg) const { if (!hasPointSampleAccel(MI)) return false; - return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER); + return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER); } void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { @@ -940,57 +954,52 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // All GDS operations must protect their address register (same as // export.) 
if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr)) - setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore); + setScoreByOperand(*AddrOp, EXP_CNT, CurrScore); if (Inst.mayStore()) { if (const auto *Data0 = TII->getNamedOperand(Inst, AMDGPU::OpName::data0)) - setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore); + setScoreByOperand(*Data0, EXP_CNT, CurrScore); if (const auto *Data1 = TII->getNamedOperand(Inst, AMDGPU::OpName::data1)) - setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore); + setScoreByOperand(*Data1, EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && Inst.getOpcode() != AMDGPU::DS_APPEND && Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (TII->isFLAT(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMIMG(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMTBUF(Inst)) { if (Inst.mayStore()) - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (TII->isMUBUF(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isLDSDIR(Inst)) { // LDSDIR instructions attach the score to the destination. - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), EXP_CNT, CurrScore); } else { if (TII->isEXP(Inst)) { @@ -1000,13 +1009,13 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // score. 
for (MachineOperand &DefMO : Inst.all_defs()) { if (TRI->isVGPR(*MRI, DefMO.getReg())) { - setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore); + setScoreByOperand(DefMO, EXP_CNT, CurrScore); } } } for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (T == X_CNT) { @@ -1020,7 +1029,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { PendingEvents &= ~(1 << OtherEvent); } for (const MachineOperand &Op : Inst.all_uses()) - setScoreByOperand(&Inst, Op, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. // @@ -1032,9 +1041,8 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // Special cases where implicit register defs exists, such as M0 or VCC, // but none with memory instructions. for (const MachineOperand &Op : Inst.defs()) { - RegInterval Interval = getRegInterval(&Inst, Op); if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { - if (Interval.first >= NUM_ALL_VGPRS) + if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper continue; if (updateVMCntOnly(Inst)) { // updateVMCntOnly should only leave us with VGPRs @@ -1047,16 +1055,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // this with another potential dependency if (hasPointSampleAccel(Inst)) TypesMask |= 1 << VMEM_NOSAMPLER; - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) - VgprVmemTypes[RegNo] |= TypesMask; + for (MCRegUnit RU : regunits(Op.getReg().asMCReg())) + VMem[toVMEMID(RU)].VMEMTypes |= TypesMask; } } - setScoreByInterval(Interval, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); } if (Inst.mayStore() && (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS // written can be accessed. A load from LDS to VMEM does not need a wait. + // + // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then + // there is a MachineInstr in LDSDMAStores used to track this LDSDMA + // store. The "Slot" is the index into LDSDMAStores + 1. unsigned Slot = 0; for (const auto *MemOp : Inst.memoperands()) { if (!MemOp->isStore() || @@ -1069,9 +1081,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // original memory object and practically produced in the module LDS // lowering pass. If there is no scope available we will not be able // to disambiguate LDS aliasing as after the module lowering all LDS - // is squashed into a single big object. Do not attempt to use one of - // the limited LDSDMAStores for something we will not be able to use - // anyway. + // is squashed into a single big object. if (!AAI || !AAI.Scope) break; for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { @@ -1084,21 +1094,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } if (Slot) break; - // The slot may not be valid because it can be >= NUM_LDS_VGPRS which + // The slot may not be valid because it can be >= NUM_LDSDMA which // means the scoreboard cannot track it. We still want to preserve the // MI in order to check alias information, though. 
LDSDMAStores.push_back(&Inst); Slot = LDSDMAStores.size(); break; } - if (Slot < NUM_LDS_VGPRS) - setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); - if (Slot) - setRegScore(FIRST_LDS_VGPR, T, CurrScore); + setVMemScore(LDSDMA_BEGIN, T, CurrScore); + if (Slot && Slot < NUM_LDSDMA) + setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore); } if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { - setRegScore(SCC, T, CurrScore); + setRegScore(AMDGPU::SCC, T, CurrScore); PendingSCCWrite = &Inst; } } @@ -1148,27 +1157,36 @@ void WaitcntBrackets::print(raw_ostream &OS) const { // Print vgpr scores. unsigned LB = getScoreLB(T); - for (int J = 0; J <= VgprUB; J++) { - unsigned RegScore = getRegScore(J, T); + SmallVector<VMEMID> SortedVMEMIDs(VMem.keys()); + sort(SortedVMEMIDs); + + for (auto ID : SortedVMEMIDs) { + unsigned RegScore = VMem.at(ID).Scores[T]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - if (J < FIRST_LDS_VGPR) { - OS << ' ' << RelScore << ":v" << J; + if (ID < REGUNITS_END) { + OS << ' ' << RelScore << ":vRU" << ID; } else { - OS << ' ' << RelScore << ":ds"; + assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END && + "Unhandled/unexpected ID value!"); + OS << ' ' << RelScore << ":LDSDMA" << ID; } } + // Also need to print sgpr scores for lgkm_cnt or xcnt. if (isSmemCounter(T)) { - for (int J = 0; J <= SgprUB; J++) { - unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); + SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys()); + sort(SortedSMEMIDs); + for (auto ID : SortedSMEMIDs) { + unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - OS << ' ' << RelScore << ":s" << J; + OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID); } } + if (T == KM_CNT && SCCScore > 0) OS << ' ' << SCCScore << ":scc"; } @@ -1213,38 +1231,65 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, Count = ~0u; } -void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const { +void WaitcntBrackets::purgeEmptyTrackingData() { + for (auto &[K, V] : make_early_inc_range(VMem)) { + if (V.empty()) + VMem.erase(K); + } + for (auto &[K, V] : make_early_inc_range(SGPRs)) { + if (V.empty()) + SGPRs.erase(K); + } +} + +void WaitcntBrackets::determineWaitForScore(InstCounterType T, + unsigned ScoreToWait, + AMDGPU::Waitcnt &Wait) const { const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - unsigned ScoreToWait = getRegScore(RegNo, T); - - // If the score of src_operand falls within the bracket, we need an - // s_waitcnt instruction. - if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !Context->ST->hasFlatLgkmVMemCountInOrder()) { - // If there is a pending FLAT operation, and this is a VMem or LGKM - // waitcnt and the target can report early completion, then we need - // to force a waitcnt 0. - addWait(Wait, T, 0); - } else if (counterOutOfOrder(T)) { - // Counter can get decremented out-of-order when there - // are multiple types event in the bracket. Also emit an s_wait counter - // with a conservative value of 0 for the counter. - addWait(Wait, T, 0); - } else { - // If a counter has been maxed out avoid overflow by waiting for - // MAX(CounterType) - 1 instead.
- unsigned NeededWait = - std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); - addWait(Wait, T, NeededWait); - } + + // If the score falls within the bracket, we need a waitcnt. + if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { + if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && + !Context->ST->hasFlatLgkmVMemCountInOrder()) { + // If there is a pending FLAT operation, and this is a VMem or LGKM + // waitcnt and the target can report early completion, then we need + // to force a waitcnt 0. + addWait(Wait, T, 0); + } else if (counterOutOfOrder(T)) { + // Counter can get decremented out-of-order when there + // are multiple types event in the bracket. Also emit an s_wait counter + // with a conservative value of 0 for the counter. + addWait(Wait, T, 0); + } else { + // If a counter has been maxed out avoid overflow by waiting for + // MAX(CounterType) - 1 instead. + unsigned NeededWait = + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); + addWait(Wait, T, NeededWait); } } } +void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const { + if (Reg == AMDGPU::SCC) { + determineWaitForScore(T, SCCScore, Wait); + } else { + bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg); + for (MCRegUnit RU : regunits(Reg)) + determineWaitForScore( + T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T), + Wait); + } +} + +void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const { + assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END); + determineWaitForScore(T, getVMemScore(TID, T), Wait); +} + void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) { // S_BARRIER_WAIT on the same barrier guarantees that the pending write to // SCC has landed @@ -1451,9 +1496,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { assert(ST->hasVMemToLDSLoad()); LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II - << "Before: " << Wait;); - ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); - LLVM_DEBUG(dbgs() << "After: " << Wait;); + << "Before: " << Wait << '\n';); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';); // It is possible (but unlikely) that this is the only wait instruction, // in which case, we exit this loop without a WaitcntInstr to consume @@ -1957,19 +2002,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); if (CallAddrOp.isReg()) { - RegInterval CallAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, CallAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait); if (const auto *RtnAddrOp = TII->getNamedOperand(MI, AMDGPU::OpName::dst)) { - RegInterval RtnAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, *RtnAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait); } } } else if (Opc == AMDGPU::S_BARRIER_WAIT) { @@ -2006,27 +2045,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, continue; // LOAD_CNT is only relevant to vgpr or LDS. 
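As a reading aid for the LDS DMA wait logic below, here is a minimal sketch of the slot arithmetic it relies on (constants mirror the patch's enum; the helper name is hypothetical, not from the patch): store I in LDSDMAStores is scored under LDSDMA_BEGIN + I + 1, while LDSDMA_BEGIN itself is the common entry updated for every LDS DMA operation.

#include <cstdint>

constexpr uint32_t kLDSDMABegin = 1u << 16; // mirrors LDSDMA_BEGIN
constexpr uint32_t kNumLDSDMA = 1u << 16;   // mirrors NUM_LDSDMA

// Hypothetical helper: VMEMID under which the I-th entry of LDSDMAStores is
// scored. Falls back to the common entry when the slot (I + 1) would exceed
// tracking capacity, matching the `(I + 1) >= NUM_LDSDMA` check below.
constexpr uint32_t ldsDmaStoreID(uint32_t I) {
  return (I + 1) >= kNumLDSDMA ? kLDSDMABegin : kLDSDMABegin + I + 1;
}

static_assert(ldsDmaStoreID(0) == kLDSDMABegin + 1,
              "first tracked store sits just past the common entry");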
- unsigned RegNo = FIRST_LDS_VGPR; + unsigned TID = LDSDMA_BEGIN; if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { - if ((I + 1) >= NUM_LDS_VGPRS) { + if ((I + 1) >= NUM_LDSDMA) { // We didn't have enough slot to track this LDS DMA store, it // has been tracked using the common RegNo (FIRST_LDS_VGPR). - ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); break; } - ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait); } } } else { - ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); + } + if (Memop->isStore()) { + ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait); } - - if (Memop->isStore()) - ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } // Loop over use and def operands. @@ -2038,7 +2077,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) continue; - RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op); + MCPhysReg Reg = Op.getReg().asMCReg(); const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); if (IsVGPR) { @@ -2057,28 +2096,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Additionally check instructions where Point Sample Acceleration // might be applied. if (Op.isUse() || !updateVMCntOnly(MI) || - ScoreBrackets.hasOtherPendingVmemTypes(Interval, - getVmemType(MI)) || - ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) || + ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) || + ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) || !ST->hasVmemWriteVgprInOrder()) { - ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait); - ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait); - ScoreBrackets.determineWait(BVH_CNT, Interval, Wait); - ScoreBrackets.clearVgprVmemTypes(Interval); + ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait); + ScoreBrackets.clearVgprVmemTypes(Reg); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { - ScoreBrackets.determineWait(EXP_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait); } - ScoreBrackets.determineWait(DS_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait); } else if (Op.getReg() == AMDGPU::SCC) { - ScoreBrackets.determineWait(KM_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait); } else { - ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait); } if (ST->hasWaitXCnt() && Op.isDef()) - ScoreBrackets.determineWait(X_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait); } } } @@ -2385,8 +2423,12 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { bool StrictDom = false; - VgprUB = std::max(VgprUB, Other.VgprUB); - SgprUB = std::max(SgprUB, Other.SgprUB); + // Check if "other" has keys we don't have, and create default entries for + // those. If they remain empty after merging, we will clean it up after. 
+ for (auto K : Other.VMem.keys()) + VMem.try_emplace(K); + for (auto K : Other.SGPRs.keys()) + SGPRs.try_emplace(K); for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter @@ -2429,23 +2471,29 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } } - for (int J = 0; J <= VgprUB; J++) - StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); + for (auto &[RegID, Info] : VMem) + StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T)); if (isSmemCounter(T)) { unsigned Idx = getSgprScoresIdx(T); - for (int J = 0; J <= SgprUB; J++) - StrictDom |= - mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]); + for (auto &[RegID, Info] : SGPRs) { + auto It = Other.SGPRs.find(RegID); + unsigned OtherScore = + (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0; + StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore); + } } } - for (int J = 0; J <= VgprUB; J++) { - unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; - StrictDom |= NewVmemTypes != VgprVmemTypes[J]; - VgprVmemTypes[J] = NewVmemTypes; + for (auto &[TID, Info] : VMem) { + if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) { + unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes; + StrictDom |= NewVmemTypes != Info.VMEMTypes; + Info.VMEMTypes = NewVmemTypes; + } } + purgeEmptyTrackingData(); return StrictDom; } @@ -2656,8 +2704,8 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, bool HasVMemLoad = false; bool HasVMemStore = false; bool UsesVgprLoadedOutside = false; - DenseSet<unsigned> VgprUse; - DenseSet<unsigned> VgprDef; + DenseSet<MCRegUnit> VgprUse; + DenseSet<MCRegUnit> VgprDef; for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { @@ -2669,21 +2717,21 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; - RegInterval Interval = Brackets.getRegInterval(&MI, Op); // Vgpr use - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. // are invalidated and we can exit. - if (VgprDef.contains(RegNo)) return false; - VgprUse.insert(RegNo); + VgprUse.insert(RU); // If at least one of Op's registers is in the score brackets, the // value is likely loaded outside of the loop. + VMEMID ID = toVMEMID(RU); - if (Brackets.getRegScore(RegNo, LOAD_CNT) > + if (Brackets.getVMemScore(ID, LOAD_CNT) > Brackets.getScoreLB(LOAD_CNT) || - Brackets.getRegScore(RegNo, SAMPLE_CNT) > + Brackets.getVMemScore(ID, SAMPLE_CNT) > Brackets.getScoreLB(SAMPLE_CNT) || - Brackets.getRegScore(RegNo, BVH_CNT) > + Brackets.getVMemScore(ID, BVH_CNT) > Brackets.getScoreLB(BVH_CNT)) { UsesVgprLoadedOutside = true; break; } } } // VMem load vgpr def if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) { for (const MachineOperand &Op : MI.all_defs()) { - RegInterval Interval = Brackets.getRegInterval(&MI, Op); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. // are invalidated and we can exit.
- if (VgprUse.contains(RegNo)) + if (VgprUse.contains(RU)) return false; - VgprDef.insert(RegNo); + VgprDef.insert(RU); } } } @@ -2779,12 +2826,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); Limits.XcntMax = AMDGPU::getXcntBitMask(IV); - [[maybe_unused]] unsigned NumVGPRsMax = - ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()); - [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); - assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); - assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); - BlockInfos.clear(); bool Modified = false; diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-merge.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-merge.ll new file mode 100644 index 0000000000000..19d741db1c612 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-merge.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -stop-after=si-insert-waitcnts < %s | FileCheck %s + +; Testcase reduced from Blender 4.1 where we generated incorrect waitcnts due to a bad +; WaitcntBrackets::merge implementation. + +%struct.bar = type { %struct.bar.0 } +%struct.bar.0 = type { float, float, float, float } + +define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1) { + ; CHECK-LABEL: name: widget + ; CHECK: bb.0.bb: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr17 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: $sgpr20_sgpr21 = S_MOV_B64 killed $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg1.kernarg.offset.align.down, align 8, addrspace 4) + ; CHECK-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1, align 16, addrspace 4) + ; CHECK-NEXT: $sgpr20 = S_ADD_U32 $sgpr20, killed $sgpr17, implicit-def $scc, implicit-def $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: $sgpr21 = S_ADDC_U32 $sgpr21, 0, implicit-def dead $scc, implicit killed $scc, implicit-def $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_WAITCNT 49279 + ; CHECK-NEXT: S_BITCMP1_B32 killed renamable $sgpr2, 0, implicit-def $scc + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit $exec + ; CHECK-NEXT: renamable $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; CHECK-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.loop.exit.guard: + ; CHECK-NEXT: successors: %bb.11(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit killed $vcc 
+ ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb2: + ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAITCNT 3952 + ; CHECK-NEXT: renamable $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 4, $vgpr0_vgpr1, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr4_vgpr5, 0, 0, implicit $exec :: (load (s32) from %ir.getelementptr, align 16, addrspace 1) + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_OR_B64 killed renamable $sgpr6_sgpr7, $exec, implicit-def dead $scc + ; CHECK-NEXT: S_WAITCNT 3952 + ; CHECK-NEXT: V_CMP_GT_I32_e32 1, killed $vgpr0, implicit-def $vcc, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12.bb13: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) null`, addrspace 1) + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_ANDN2_B64 killed renamable $sgpr6_sgpr7, $exec, implicit-def dead $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.Flow3: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc + ; CHECK-NEXT: renamable $sgpr8_sgpr9 = S_AND_B64 $exec, renamable $sgpr6_sgpr7, implicit-def $scc + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; CHECK-NEXT: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; CHECK-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr20_sgpr21_sgpr22_sgpr23, 0, 0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(5) null`, addrspace 5) + ; CHECK-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) null`, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.Flow2: + ; CHECK-NEXT: successors: %bb.10(0x04000000), %bb.4(0x7c000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable 
$sgpr6_sgpr7, implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.10, implicit killed $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb6: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAITCNT 3952 + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = V_CMP_EQ_U32_e64 0, killed $vgpr4, implicit $exec + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr6_sgpr7, implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.5, implicit killed $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.bb9: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr1, renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.arg.load, addrspace 1) + ; CHECK-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) null`, addrspace 1) + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_MOV_B64 -1 + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.Flow: + ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 -1 + ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_MOV_B64 -1 + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.9, implicit killed $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.bb11: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3 + ; CHECK-NEXT: S_BRANCH %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11.DummyReturnBlock: + ; CHECK-NEXT: liveins: $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_ENDPGM 0 +bb: + br label %bb2 + +bb2: ; preds = %bb13, %bb11, %bb + %phi = phi i32 [ 0, %bb ], [ %load14, %bb13 ], [ %load4, %bb11 ] + %xor = xor i32 %phi, 1 + %zext = zext i32 %xor to i64 + %getelementptr = getelementptr %struct.bar, ptr addrspace(1) null, i64 %zext + %load = load i32, ptr addrspace(1) %getelementptr, align 16 + %icmp = icmp sgt i32 %load, 0 + br i1 %icmp, label %bb3, label %bb13 + +bb3: ; preds = %bb2 + %load4 = load i32, ptr addrspace(5) null, align 4 + %load5 = load i32, ptr addrspace(1) null, align 4 + br label %bb6 + +bb6: ; preds = %bb11, %bb3 + %phi7 = phi i32 [ %load5, %bb3 ], [ %phi12, %bb11 ] + %icmp8 = icmp eq i32 %phi7, 0 + br i1 %icmp8, label %bb11, label %bb9 + +bb9: ; preds = %bb6 + store i32 0, ptr addrspace(1) %arg, align 4 + 
%load10 = load i32, ptr addrspace(1) null, align 4 + br label %bb11 + +bb11: ; preds = %bb9, %bb6 + %phi12 = phi i32 [ 0, %bb6 ], [ %load10, %bb9 ] + br i1 %arg1, label %bb2, label %bb6 + +bb13: ; preds = %bb2 + %load14 = load i32, ptr addrspace(1) null, align 4 + br label %bb2 +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 74513ec9106bc..e759cde92594e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -156,8 +156,6 @@ main_body: ret void } -; There are 8 pseudo registers defined to track LDS DMA dependencies. - define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) { ; GFX9-LABEL: buffer_load_lds_dword_10_arrays: ; GFX9: ; %bb.0: ; %main_body @@ -223,9 +221,10 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX9-NEXT: ; wave barrier -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] @@ -289,9 +288,10 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX10-NEXT: ; wave barrier -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]