diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 3d6fc309c7cf4..7c0525b9c9957 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -97,7 +97,40 @@ auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { return enum_seq(LOAD_CNT, MaxCounter); } -using RegInterval = std::pair<int, int>; +/// Integer IDs used to track vector memory locations we may have to wait on. +/// Encoded as u16 chunks: +/// +/// [0, REGUNITS_END): MCRegUnit +/// [LDSDMA_BEGIN, LDSDMA_END): LDS DMA IDs +/// +/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary. +/// It gives 2^16 entries per category, which is more than enough +/// for all register units. MCPhysReg is u16 so we don't even support >u16 +/// physical register numbers at this time, let alone >u16 register units. +/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END +/// is enough for all register units. +using VMEMID = uint32_t; + +enum : VMEMID { + TRACKINGID_RANGE_LEN = (1 << 16), + + // Important: MCRegUnits must always be tracked starting from 0, as we + // need to be able to convert between an MCRegUnit and a VMEMID freely. + REGUNITS_BEGIN = 0, + REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN, + + // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common" + // entry, which is updated for all LDS DMA operations encountered. + // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1. + NUM_LDSDMA = TRACKINGID_RANGE_LEN, + LDSDMA_BEGIN = REGUNITS_END, + LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA, +}; + +/// Convert an MCRegUnit to a VMEMID. +static constexpr VMEMID toVMEMID(MCRegUnit RU) { + return static_cast<VMEMID>(RU); +} struct HardwareLimits { unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12. @@ -146,30 +179,6 @@ static constexpr StringLiteral WaitEventTypeName[] = { #undef AMDGPU_EVENT_NAME // clang-format on -// The mapping is: -// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs -// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots -// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs -// NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS .. SCC -// We reserve a fixed number of VGPR slots in the scoring tables for -// special tokens like SCMEM_LDS (needed for buffer load to LDS). -enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets. - AGPR_OFFSET = 512, // Maximum programmable ArchVGPRs across all targets. - SQ_MAX_PGM_SGPRS = 128, // Maximum programmable SGPRs across all targets. - // Artificial register slots to track LDS writes into specific LDS locations - // if a location is known. When slots are exhausted or location is - // unknown use the first slot. The first slot is also always updated in - // addition to known location's slot to properly generate waits if dependent - // instruction's location is unknown. - FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores. - NUM_LDS_VGPRS = 9, // One more than the stores we track. - NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start. - NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS, - // Remaining non-allocatable registers - SCC = NUM_ALL_ALLOCATABLE -}; - // Enumerate different types of result-returning VMEM operations. Although // s_waitcnt orders them all with a single vmcnt counter, in the absence of // s_waitcnt only instructions of the same VmemType are guaranteed to write @@ -585,7 +594,30 @@ class SIInsertWaitcnts { // "s_waitcnt 0" before use.
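As a minimal, self-contained sketch of the VMEMID partitioning described above (the enum values mirror the ones in the patch; toLDSDMAID is a hypothetical helper invented for this illustration, not part of the patch):

#include <cstdint>

namespace vmemid_sketch {
using VMEMID = uint32_t;
enum : VMEMID {
  TRACKINGID_RANGE_LEN = 1u << 16, // 65536 IDs per category
  REGUNITS_BEGIN = 0,
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END, // slot 0: the "common" LDS DMA entry
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
};

// Hypothetical helper: VMEMID of LDS DMA slot `Slot`, where slot 0 is the
// common entry that is updated for every LDS DMA operation encountered.
constexpr VMEMID toLDSDMAID(uint32_t Slot) { return LDSDMA_BEGIN + Slot; }

static_assert(REGUNITS_END == 0x10000, "reg units fill the low u16 chunk");
static_assert(toLDSDMAID(0) == 0x10000, "common LDS DMA entry comes next");
static_assert(LDSDMA_END == 0x20000, "LDS DMA IDs fill the second chunk");
} // namespace vmemid_sketch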
class WaitcntBrackets { public: - WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) { + assert(Context->TRI->getNumRegUnits() < REGUNITS_END); + } + +#ifndef NDEBUG + ~WaitcntBrackets() { + unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0; + for (auto &[ID, Val] : VMem) { + if (Val.empty()) + ++NumUnusedVmem; + } + for (auto &[ID, Val] : SGPRs) { + if (Val.empty()) + ++NumUnusedSGPRs; + } + + if (NumUnusedVmem || NumUnusedSGPRs) { + errs() << "WaitcntBracket had unused entries at destruction time: " + << NumUnusedVmem << " VMem and " << NumUnusedSGPRs + << " SGPR unused entries\n"; + std::abort(); + } + } +#endif bool isSmemCounter(InstCounterType T) const { return T == Context->SmemAccessCounter || T == X_CNT; @@ -610,22 +642,18 @@ class WaitcntBrackets { return getScoreUB(T) - getScoreLB(T); } - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - - if (GprNo < NUM_ALL_ALLOCATABLE) - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const { + auto It = SGPRs.find(RU); + return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0; + } - assert(GprNo == SCC); - return SCCScore; + unsigned getVMemScore(VMEMID TID, InstCounterType T) const { + auto It = VMem.find(TID); + return It != VMem.end() ? It->second.Scores[T] : 0; } bool merge(const WaitcntBrackets &Other); - RegInterval getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const; - bool counterOutOfOrder(InstCounterType T) const; void simplifyWaitcnt(AMDGPU::Waitcnt &Wait); void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; @@ -633,12 +661,10 @@ class WaitcntBrackets { bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait); void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait); - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } + void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const; + void determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const; void tryClearSCCWriteEvent(MachineInstr *Inst); void applyWaitcnt(const AMDGPU::Waitcnt &Wait); @@ -686,19 +712,22 @@ class WaitcntBrackets { // Return true if there might be pending writes to the vgpr-interval by VMEM // instructions with types different from V. 
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) + bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const { + for (MCRegUnit RU : regunits(Reg)) { + auto It = VMem.find(toVMEMID(RU)); + if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V))) return true; } return false; } - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; + void clearVgprVmemTypes(MCPhysReg Reg) { + for (MCRegUnit RU : regunits(Reg)) { + if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) { + It->second.VMEMTypes = 0; + if (It->second.empty()) + VMem.erase(It); + } } } @@ -714,11 +743,15 @@ class WaitcntBrackets { bool hasPointSampleAccel(const MachineInstr &MI) const; bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; + MCPhysReg Reg) const; void print(raw_ostream &) const; void dump() const { print(dbgs()); } + // Free up memory by removing empty entries from the DenseMaps that track + // event scores. + void purgeEmptyTrackingData(); + private: struct MergeInfo { unsigned OldLB; @@ -726,9 +759,24 @@ class WaitcntBrackets { unsigned MyShift; unsigned OtherShift; }; + + void determineWaitForScore(InstCounterType T, unsigned Score, + AMDGPU::Waitcnt &Wait) const; + static bool mergeScore(const MergeInfo &M, unsigned &Score, unsigned OtherScore); + iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const { + assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC"); + if (!Context->TRI->isInAllocatableClass(Reg)) + return {{}, {}}; + const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg); + unsigned Size = Context->TRI->getRegSizeInBits(*RC); + if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) + Reg = Context->TRI->get32BitRegister(Reg); + return Context->TRI->regunits(Reg); + } + void setScoreLB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); ScoreLBs[T] = Val; @@ -745,15 +793,28 @@ class WaitcntBrackets { ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); } - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); + void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) { + const SIRegisterInfo *TRI = Context->TRI; + if (Reg == AMDGPU::SCC) { + SCCScore = Val; + } else if (TRI->isVectorRegister(*Context->MRI, Reg)) { + for (MCRegUnit RU : regunits(Reg)) + VMem[toVMEMID(RU)].Scores[T] = Val; + } else if (TRI->isSGPRReg(*Context->MRI, Reg)) { + auto STy = getSgprScoresIdx(T); + for (MCRegUnit RU : regunits(Reg)) + SGPRs[RU].Scores[STy] = Val; + } else { + llvm_unreachable("Register cannot be tracked/unknown register!"); + } } - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); + void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) { + VMem[TID].Scores[T] = Val; + } - void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op, - InstCounterType CntTy, unsigned Val); + void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); const SIInsertWaitcnts *Context; @@ -764,26 +825,52 @@ class WaitcntBrackets { unsigned LastFlat[NUM_INST_CNTS] = {0}; // Remember the last GDS operation. unsigned LastGDS = 0; - // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + + // The score tracking logic is fragmented as follows: + // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding. + // - SGPRs: SGPR RegUnits + // - SCC: Non-allocatable and not general purpose: not an SGPR. + // + // For the VMem case, if the key is within the range of LDS DMA IDs, + // then the corresponding index into the `LDSDMAStores` vector below is: + // Key - LDSDMA_BEGIN - 1 + // This is because LDSDMA_BEGIN is a generic entry and does not have an + // associated MachineInstr. + // + // TODO: Could we track SCC alongside SGPRs so it's no longer a special case? + + struct VMEMInfo { + // Scores for all instruction counters. + std::array<unsigned, NUM_INST_CNTS> Scores = {0}; + // Bitmask of the VmemTypes of VMEM instructions that may have a pending + // write to this register unit. + unsigned VMEMTypes = 0; + + bool empty() const { + return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes; + } + }; + + struct SGPRInfo { + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps + // the X_CNT score. + std::array<unsigned, 2> Scores = {0}; + + bool empty() const { return !Scores[0] && !Scores[1]; } + }; + + DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA + DenseMap<MCRegUnit, SGPRInfo> SGPRs; + // Reg score for SCC. unsigned SCCScore = 0; // The unique instruction that has an SCC write pending, if there is one. const MachineInstr *PendingSCCWrite = nullptr; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is // alias info. One store is kept per unique AAInfo. - SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; + SmallVector<const MachineInstr *> LDSDMAStores; }; class SIInsertWaitcntsLegacy : public MachineFunctionPass { @@ -809,82 +896,9 @@ class SIInsertWaitcntsLegacy : public MachineFunctionPass { } // end anonymous namespace -RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, - const MachineOperand &Op) const { - if (Op.getReg() == AMDGPU::SCC) - return {SCC, SCC + 1}; - - const SIRegisterInfo *TRI = Context->TRI; - const MachineRegisterInfo *MRI = Context->MRI; - - if (!TRI->isInAllocatableClass(Op.getReg())) - return {-1, -1}; - - // A use via a PW operand does not need a waitcnt. - // A partial write is not a WAW. - assert(!Op.getSubReg() || !Op.isUndef()); - - RegInterval Result; - - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); - unsigned RegIdx = TRI->getHWRegIndex(MCReg); - - const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); - unsigned Size = TRI->getRegSizeInBits(*RC); - - // AGPRs/VGPRs are tracked every 16 bits, SGPRs by 32 bits - if (TRI->isVectorRegister(*MRI, Op.getReg())) { - unsigned Reg = RegIdx << 1 | (AMDGPU::isHi16Reg(MCReg, *TRI) ?
1 : 0); - assert(!Context->ST->hasMAIInsts() || Reg < AGPR_OFFSET); - Result.first = Reg; - if (TRI->isAGPR(*MRI, Op.getReg())) - Result.first += AGPR_OFFSET; - assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); - assert(Size % 16 == 0); - Result.second = Result.first + (Size / 16); - - if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) { - // Regardless of which lo16/hi16 is used, consider the full 32-bit - // register used. - if (AMDGPU::isHi16Reg(MCReg, *TRI)) - Result.first -= 1; - else - Result.second += 1; - } - } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) { - // SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar - // sources like SRC_PRIVATE_BASE. - Result.first = RegIdx + NUM_ALL_VGPRS; - Result.second = Result.first + divideCeil(Size, 32); - } else { - return {-1, -1}; - } - - return Result; -} - -void WaitcntBrackets::setScoreByInterval(RegInterval Interval, - InstCounterType CntTy, - unsigned Score) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (RegNo < NUM_ALL_VGPRS) { - VgprUB = std::max(VgprUB, RegNo); - VgprScores[CntTy][RegNo] = Score; - } else if (RegNo < NUM_ALL_ALLOCATABLE) { - SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS); - SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score; - } else { - assert(RegNo == SCC); - SCCScore = Score; - } - } -} - -void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, - const MachineOperand &Op, +void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy, unsigned Score) { - RegInterval Interval = getRegInterval(MI, Op); - setScoreByInterval(Interval, CntTy, Score); + setRegScore(Op.getReg().asMCReg(), CntTy, Score); } // Return true if the subtarget is one that enables Point Sample Acceleration @@ -907,12 +921,12 @@ bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { // one that has outstanding writes to vmem-types different than VMEM_NOSAMPLER // (this is the type that a point sample accelerated instruction effectively // becomes) -bool WaitcntBrackets::hasPointSamplePendingVmemTypes( - const MachineInstr &MI, RegInterval Interval) const { +bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI, + MCPhysReg Reg) const { if (!hasPointSampleAccel(MI)) return false; - return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER); + return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER); } void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { @@ -940,57 +954,52 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // All GDS operations must protect their address register (same as // export.) 
if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr)) - setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore); + setScoreByOperand(*AddrOp, EXP_CNT, CurrScore); if (Inst.mayStore()) { if (const auto *Data0 = TII->getNamedOperand(Inst, AMDGPU::OpName::data0)) - setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore); + setScoreByOperand(*Data0, EXP_CNT, CurrScore); if (const auto *Data1 = TII->getNamedOperand(Inst, AMDGPU::OpName::data1)) - setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore); + setScoreByOperand(*Data1, EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && Inst.getOpcode() != AMDGPU::DS_APPEND && Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (TII->isFLAT(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMIMG(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMTBUF(Inst)) { if (Inst.mayStore()) - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (TII->isMUBUF(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); + setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::data), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isLDSDIR(Inst)) { // LDSDIR instructions attach the score to the destination. - setScoreByOperand(&Inst, - *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), + setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), EXP_CNT, CurrScore); } else { if (TII->isEXP(Inst)) { @@ -1000,13 +1009,13 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // score. 
for (MachineOperand &DefMO : Inst.all_defs()) { if (TRI->isVGPR(*MRI, DefMO.getReg())) { - setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore); + setScoreByOperand(DefMO, EXP_CNT, CurrScore); } } } for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); + setScoreByOperand(Op, EXP_CNT, CurrScore); } } } else if (T == X_CNT) { @@ -1020,7 +1029,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { PendingEvents &= ~(1 << OtherEvent); } for (const MachineOperand &Op : Inst.all_uses()) - setScoreByOperand(&Inst, Op, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. // @@ -1032,9 +1041,8 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // Special cases where implicit register defs exists, such as M0 or VCC, // but none with memory instructions. for (const MachineOperand &Op : Inst.defs()) { - RegInterval Interval = getRegInterval(&Inst, Op); if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { - if (Interval.first >= NUM_ALL_VGPRS) + if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper continue; if (updateVMCntOnly(Inst)) { // updateVMCntOnly should only leave us with VGPRs @@ -1047,16 +1055,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // this with another potential dependency if (hasPointSampleAccel(Inst)) TypesMask |= 1 << VMEM_NOSAMPLER; - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) - VgprVmemTypes[RegNo] |= TypesMask; + for (MCRegUnit RU : regunits(Op.getReg().asMCReg())) + VMem[toVMEMID(RU)].VMEMTypes |= TypesMask; } } - setScoreByInterval(Interval, T, CurrScore); + setScoreByOperand(Op, T, CurrScore); } if (Inst.mayStore() && (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS // written can be accessed. A load from LDS to VMEM does not need a wait. + // + // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then + // there is a MachineInstr in LDSDMAStores used to track this LDSDMA + // store. The "Slot" is the index into LDSDMAStores + 1. unsigned Slot = 0; for (const auto *MemOp : Inst.memoperands()) { if (!MemOp->isStore() || @@ -1069,9 +1081,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // original memory object and practically produced in the module LDS // lowering pass. If there is no scope available we will not be able // to disambiguate LDS aliasing as after the module lowering all LDS - // is squashed into a single big object. Do not attempt to use one of - // the limited LDSDMAStores for something we will not be able to use - // anyway. + // is squashed into a single big object. if (!AAI || !AAI.Scope) break; for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { @@ -1084,21 +1094,20 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } if (Slot) break; - // The slot may not be valid because it can be >= NUM_LDS_VGPRS which + // The slot may not be valid because it can be >= NUM_LDSDMA which // means the scoreboard cannot track it. We still want to preserve the // MI in order to check alias information, though. 
LDSDMAStores.push_back(&Inst); Slot = LDSDMAStores.size(); break; } - if (Slot < NUM_LDS_VGPRS) - setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore); - if (Slot) - setRegScore(FIRST_LDS_VGPR, T, CurrScore); + setVMemScore(LDSDMA_BEGIN, T, CurrScore); + if (Slot && Slot < NUM_LDSDMA) + setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore); } if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { - setRegScore(SCC, T, CurrScore); + setRegScore(AMDGPU::SCC, T, CurrScore); PendingSCCWrite = &Inst; } } @@ -1148,27 +1157,36 @@ void WaitcntBrackets::print(raw_ostream &OS) const { // Print vgpr scores. unsigned LB = getScoreLB(T); - for (int J = 0; J <= VgprUB; J++) { - unsigned RegScore = getRegScore(J, T); + SmallVector<VMEMID> SortedVMEMIDs(VMem.keys()); + sort(SortedVMEMIDs); + + for (auto ID : SortedVMEMIDs) { + unsigned RegScore = VMem.at(ID).Scores[T]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - if (J < FIRST_LDS_VGPR) { - OS << ' ' << RelScore << ":v" << J; + if (ID < REGUNITS_END) { + OS << ' ' << RelScore << ":vRU" << ID; } else { - OS << ' ' << RelScore << ":ds"; + assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END && + "Unhandled/unexpected ID value!"); + OS << ' ' << RelScore << ":LDSDMA" << ID; } } + // Also need to print sgpr scores for lgkm_cnt or xcnt. if (isSmemCounter(T)) { - for (int J = 0; J <= SgprUB; J++) { - unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T); + SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys()); + sort(SortedSMEMIDs); + for (auto ID : SortedSMEMIDs) { + unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)]; if (RegScore <= LB) continue; unsigned RelScore = RegScore - LB - 1; - OS << ' ' << RelScore << ":s" << J; + OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID); } } + if (T == KM_CNT && SCCScore > 0) OS << ' ' << SCCScore << ":scc"; } @@ -1213,38 +1231,65 @@ void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, Count = ~0u; } -void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const { +void WaitcntBrackets::purgeEmptyTrackingData() { + for (auto &[K, V] : make_early_inc_range(VMem)) { + if (V.empty()) + VMem.erase(K); + } + for (auto &[K, V] : make_early_inc_range(SGPRs)) { + if (V.empty()) + SGPRs.erase(K); + } +} + +void WaitcntBrackets::determineWaitForScore(InstCounterType T, + unsigned ScoreToWait, + AMDGPU::Waitcnt &Wait) const { const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - unsigned ScoreToWait = getRegScore(RegNo, T); - - // If the score of src_operand falls within the bracket, we need an - // s_waitcnt instruction. - if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !Context->ST->hasFlatLgkmVMemCountInOrder()) { - // If there is a pending FLAT operation, and this is a VMem or LGKM - // waitcnt and the target can report early completion, then we need - // to force a waitcnt 0. - addWait(Wait, T, 0); - } else if (counterOutOfOrder(T)) { - // Counter can get decremented out-of-order when there - // are multiple types event in the bracket. Also emit an s_wait counter - // with a conservative value of 0 for the counter. - addWait(Wait, T, 0); - } else { - // If a counter has been maxed out avoid overflow by waiting for - // MAX(CounterType) - 1 instead.
- unsigned NeededWait = - std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); - addWait(Wait, T, NeededWait); - } + + // If the score falls within the bracket, we need a waitcnt. + if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { + if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && + !Context->ST->hasFlatLgkmVMemCountInOrder()) { + // If there is a pending FLAT operation, and this is a VMem or LGKM + // waitcnt and the target can report early completion, then we need + // to force a waitcnt 0. + addWait(Wait, T, 0); + } else if (counterOutOfOrder(T)) { + // Counter can get decremented out-of-order when there + // are multiple types event in the bracket. Also emit an s_wait counter + // with a conservative value of 0 for the counter. + addWait(Wait, T, 0); + } else { + // If a counter has been maxed out avoid overflow by waiting for + // MAX(CounterType) - 1 instead. + unsigned NeededWait = + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); + addWait(Wait, T, NeededWait); } } } +void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, + AMDGPU::Waitcnt &Wait) const { + if (Reg == AMDGPU::SCC) { + determineWaitForScore(T, SCCScore, Wait); + } else { + bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg); + for (MCRegUnit RU : regunits(Reg)) + determineWaitForScore( + T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T), + Wait); + } +} + +void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID, + AMDGPU::Waitcnt &Wait) const { + assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END); + determineWaitForScore(T, getVMemScore(TID, T), Wait); +} + void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) { // S_BARRIER_WAIT on the same barrier guarantees that the pending write to // SCC has landed @@ -1451,9 +1496,9 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) { assert(ST->hasVMemToLDSLoad()); LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II - << "Before: " << Wait;); - ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); - LLVM_DEBUG(dbgs() << "After: " << Wait;); + << "Before: " << Wait << '\n';); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait); + LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';); // It is possible (but unlikely) that this is the only wait instruction, // in which case, we exit this loop without a WaitcntInstr to consume @@ -1957,19 +2002,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); if (CallAddrOp.isReg()) { - RegInterval CallAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, CallAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait); if (const auto *RtnAddrOp = TII->getNamedOperand(MI, AMDGPU::OpName::dst)) { - RegInterval RtnAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, *RtnAddrOp); - - ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval, - Wait); + ScoreBrackets.determineWaitForPhysReg( + SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait); } } } else if (Opc == AMDGPU::S_BARRIER_WAIT) { @@ -2006,27 +2045,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, continue; // LOAD_CNT is only relevant to vgpr or LDS. 
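As a reading aid for the LDS DMA wait logic below, here is a minimal sketch of the slot arithmetic it relies on (constants mirror the patch's enum; the helper name is hypothetical, not from the patch): store I in LDSDMAStores is scored under LDSDMA_BEGIN + I + 1, while LDSDMA_BEGIN itself is the common entry updated for every LDS DMA operation.

#include <cstdint>

constexpr uint32_t kLDSDMABegin = 1u << 16; // mirrors LDSDMA_BEGIN
constexpr uint32_t kNumLDSDMA = 1u << 16;   // mirrors NUM_LDSDMA

// Hypothetical helper: VMEMID under which the I-th entry of LDSDMAStores is
// scored. Falls back to the common entry when the slot (I + 1) would exceed
// tracking capacity, matching the `(I + 1) >= NUM_LDSDMA` check below.
constexpr uint32_t ldsDmaStoreID(uint32_t I) {
  return (I + 1) >= kNumLDSDMA ? kLDSDMABegin : kLDSDMABegin + I + 1;
}

static_assert(ldsDmaStoreID(0) == kLDSDMABegin + 1,
              "first tracked store sits just past the common entry");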
- unsigned RegNo = FIRST_LDS_VGPR; + unsigned TID = LDSDMA_BEGIN; if (Ptr && Memop->getAAInfo()) { const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { - if ((I + 1) >= NUM_LDS_VGPRS) { + if ((I + 1) >= NUM_LDSDMA) { // We didn't have enough slot to track this LDS DMA store, it // has been tracked using the common RegNo (FIRST_LDS_VGPR). - ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); break; } - ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait); } } } else { - ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait); + ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait); + } + if (Memop->isStore()) { + ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait); } - - if (Memop->isStore()) - ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); } // Loop over use and def operands. @@ -2038,7 +2077,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) continue; - RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op); + MCPhysReg Reg = Op.getReg().asMCReg(); const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); if (IsVGPR) { @@ -2057,28 +2096,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Additionally check instructions where Point Sample Acceleration // might be applied. if (Op.isUse() || !updateVMCntOnly(MI) || - ScoreBrackets.hasOtherPendingVmemTypes(Interval, - getVmemType(MI)) || - ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) || + ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) || + ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) || !ST->hasVmemWriteVgprInOrder()) { - ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait); - ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait); - ScoreBrackets.determineWait(BVH_CNT, Interval, Wait); - ScoreBrackets.clearVgprVmemTypes(Interval); + ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait); + ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait); + ScoreBrackets.clearVgprVmemTypes(Reg); } if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) { - ScoreBrackets.determineWait(EXP_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait); } - ScoreBrackets.determineWait(DS_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait); } else if (Op.getReg() == AMDGPU::SCC) { - ScoreBrackets.determineWait(KM_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait); } else { - ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait); } if (ST->hasWaitXCnt() && Op.isDef()) - ScoreBrackets.determineWait(X_CNT, Interval, Wait); + ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait); } } } @@ -2385,8 +2423,12 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { bool StrictDom = false; - VgprUB = std::max(VgprUB, Other.VgprUB); - SgprUB = std::max(SgprUB, Other.SgprUB); + // Check if "other" has keys we don't have, and create default entries for + // those. If they remain empty after merging, we will clean it up after. 
+ for (auto K : Other.VMem.keys()) + VMem.try_emplace(K); + for (auto K : Other.SGPRs.keys()) + SGPRs.try_emplace(K); for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter @@ -2429,23 +2471,29 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } } - for (int J = 0; J <= VgprUB; J++) - StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); + for (auto &[RegID, Info] : VMem) + StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T)); if (isSmemCounter(T)) { unsigned Idx = getSgprScoresIdx(T); - for (int J = 0; J <= SgprUB; J++) - StrictDom |= - mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]); + for (auto &[RegID, Info] : SGPRs) { + auto It = Other.SGPRs.find(RegID); + unsigned OtherScore = + (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0; + StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore); + } } } - for (int J = 0; J <= VgprUB; J++) { - unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; - StrictDom |= NewVmemTypes != VgprVmemTypes[J]; - VgprVmemTypes[J] = NewVmemTypes; + for (auto &[TID, Info] : VMem) { + if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) { + unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes; + StrictDom |= NewVmemTypes != Info.VMEMTypes; + Info.VMEMTypes = NewVmemTypes; + } } + purgeEmptyTrackingData(); return StrictDom; } @@ -2656,8 +2704,8 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, bool HasVMemLoad = false; bool HasVMemStore = false; bool UsesVgprLoadedOutside = false; - DenseSet<unsigned> VgprUse; - DenseSet<unsigned> VgprDef; + DenseSet<MCRegUnit> VgprUse; + DenseSet<MCRegUnit> VgprDef; for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { @@ -2669,21 +2717,21 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; - RegInterval Interval = Brackets.getRegInterval(&MI, Op); // Vgpr use - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. // are invalidated and we can exit. - if (VgprDef.contains(RegNo)) return false; - VgprUse.insert(RegNo); + VgprUse.insert(RU); // If at least one of Op's registers is in the score brackets, the // value is likely loaded outside of the loop. + VMEMID ID = toVMEMID(RU); - if (Brackets.getRegScore(RegNo, LOAD_CNT) > + if (Brackets.getVMemScore(ID, LOAD_CNT) > Brackets.getScoreLB(LOAD_CNT) || - Brackets.getRegScore(RegNo, SAMPLE_CNT) > + Brackets.getVMemScore(ID, SAMPLE_CNT) > Brackets.getScoreLB(SAMPLE_CNT) || - Brackets.getRegScore(RegNo, BVH_CNT) > + Brackets.getVMemScore(ID, BVH_CNT) > Brackets.getScoreLB(BVH_CNT)) { UsesVgprLoadedOutside = true; break; } } } // VMem load vgpr def if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) { for (const MachineOperand &Op : MI.all_defs()) { - RegInterval Interval = Brackets.getRegInterval(&MI, Op); - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { // If we find a register that is loaded inside the loop, 1. and 2. // are invalidated and we can exit.
- if (VgprUse.contains(RegNo)) + if (VgprUse.contains(RU)) return false; - VgprDef.insert(RegNo); + VgprDef.insert(RU); } } } @@ -2779,12 +2826,6 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); Limits.XcntMax = AMDGPU::getXcntBitMask(IV); - [[maybe_unused]] unsigned NumVGPRsMax = - ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()); - [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); - assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); - assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); - BlockInfos.clear(); bool Modified = false; diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-merge.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-merge.ll new file mode 100644 index 0000000000000..19d741db1c612 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-merge.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -stop-after=si-insert-waitcnts < %s | FileCheck %s + +; Testcase reduced from Blender 4.1 where we generated incorrect waitcnts due to a bad +; WaitcntBrackets::merge implementation. + +%struct.bar = type { %struct.bar.0 } +%struct.bar.0 = type { float, float, float, float } + +define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1) { + ; CHECK-LABEL: name: widget + ; CHECK: bb.0.bb: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr17 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr22_sgpr23 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: $sgpr20_sgpr21 = S_MOV_B64 killed $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 8, 0 :: (dereferenceable invariant load (s32) from %ir.arg1.kernarg.offset.align.down, align 8, addrspace 4) + ; CHECK-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1, align 16, addrspace 4) + ; CHECK-NEXT: $sgpr20 = S_ADD_U32 $sgpr20, killed $sgpr17, implicit-def $scc, implicit-def $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: $sgpr21 = S_ADDC_U32 $sgpr21, 0, implicit-def dead $scc, implicit killed $scc, implicit-def $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_WAITCNT 49279 + ; CHECK-NEXT: S_BITCMP1_B32 killed renamable $sgpr2, 0, implicit-def $scc + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit $exec + ; CHECK-NEXT: renamable $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; CHECK-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10.loop.exit.guard: + ; CHECK-NEXT: successors: %bb.11(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = IMPLICIT_DEF + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit killed $vcc 
+ ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb2: + ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAITCNT 3952 + ; CHECK-NEXT: renamable $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 4, $vgpr0_vgpr1, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr4_vgpr5, 0, 0, implicit $exec :: (load (s32) from %ir.getelementptr, align 16, addrspace 1) + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_OR_B64 killed renamable $sgpr6_sgpr7, $exec, implicit-def dead $scc + ; CHECK-NEXT: S_WAITCNT 3952 + ; CHECK-NEXT: V_CMP_GT_I32_e32 1, killed $vgpr0, implicit-def $vcc, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.12.bb13: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) null`, addrspace 1) + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_ANDN2_B64 killed renamable $sgpr6_sgpr7, $exec, implicit-def dead $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.Flow3: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc + ; CHECK-NEXT: renamable $sgpr8_sgpr9 = S_AND_B64 $exec, renamable $sgpr6_sgpr7, implicit-def $scc + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr8_sgpr9, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; CHECK-NEXT: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.bb3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; CHECK-NEXT: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr20_sgpr21_sgpr22_sgpr23, 0, 0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(5) null`, addrspace 5) + ; CHECK-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) null`, addrspace 1) + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9.Flow2: + ; CHECK-NEXT: successors: %bb.10(0x04000000), %bb.4(0x7c000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable 
$sgpr6_sgpr7, implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.10, implicit killed $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.bb6: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.5(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAITCNT 3952 + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = V_CMP_EQ_U32_e64 0, killed $vgpr4, implicit $exec + ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr6_sgpr7, implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.5, implicit killed $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.bb9: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr1, renamable $vgpr1, renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s32) into %ir.arg.load, addrspace 1) + ; CHECK-NEXT: renamable $vgpr4 = GLOBAL_LOAD_DWORD renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) null`, addrspace 1) + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_MOV_B64 -1 + ; CHECK-NEXT: S_BRANCH %bb.7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.Flow: + ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr6_sgpr7, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 -1 + ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = S_MOV_B64 -1 + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.9, implicit killed $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8.bb11: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr4, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr0_vgpr1:0x000000000000000C, $vgpr2_vgpr3, $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3 + ; CHECK-NEXT: S_BRANCH %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.11.DummyReturnBlock: + ; CHECK-NEXT: liveins: $sgpr20_sgpr21_sgpr22_sgpr23 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_ENDPGM 0 +bb: + br label %bb2 + +bb2: ; preds = %bb13, %bb11, %bb + %phi = phi i32 [ 0, %bb ], [ %load14, %bb13 ], [ %load4, %bb11 ] + %xor = xor i32 %phi, 1 + %zext = zext i32 %xor to i64 + %getelementptr = getelementptr %struct.bar, ptr addrspace(1) null, i64 %zext + %load = load i32, ptr addrspace(1) %getelementptr, align 16 + %icmp = icmp sgt i32 %load, 0 + br i1 %icmp, label %bb3, label %bb13 + +bb3: ; preds = %bb2 + %load4 = load i32, ptr addrspace(5) null, align 4 + %load5 = load i32, ptr addrspace(1) null, align 4 + br label %bb6 + +bb6: ; preds = %bb11, %bb3 + %phi7 = phi i32 [ %load5, %bb3 ], [ %phi12, %bb11 ] + %icmp8 = icmp eq i32 %phi7, 0 + br i1 %icmp8, label %bb11, label %bb9 + +bb9: ; preds = %bb6 + store i32 0, ptr addrspace(1) %arg, align 4 + 
%load10 = load i32, ptr addrspace(1) null, align 4 + br label %bb11 + +bb11: ; preds = %bb9, %bb6 + %phi12 = phi i32 [ 0, %bb6 ], [ %load10, %bb9 ] + br i1 %arg1, label %bb2, label %bb6 + +bb13: ; preds = %bb2 + %load14 = load i32, ptr addrspace(1) null, align 4 + br label %bb2 +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index 74513ec9106bc..e759cde92594e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -156,8 +156,6 @@ main_body: ret void } -; There are 8 pseudo registers defined to track LDS DMA dependencies. - define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) { ; GFX9-LABEL: buffer_load_lds_dword_10_arrays: ; GFX9: ; %bb.0: ; %main_body @@ -223,9 +221,10 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX9-NEXT: ; wave barrier -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] @@ -289,9 +288,10 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792 ; GFX10-NEXT: ; wave barrier -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048 ; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]